{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T16:42:36Z","timestamp":1779295356472,"version":"3.51.4"},"publisher-location":"Cham","reference-count":63,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031736490","type":"print"},{"value":"9783031736506","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73650-6_6","type":"book-chapter","created":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T18:16:36Z","timestamp":1732126596000},"page":"87-104","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":76,"title":["GenAD: Generative End-to-End Autonomous Driving"],"prefix":"10.1007","author":[{"given":"Wenzhao","family":"Zheng","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruiqi","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xianda","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenming","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Long","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,21]]},"reference":[{"key":"6_CR1","doi-asserted-by":"crossref","unstructured":"Bouchard, F., Sedwards, S., Czarnecki, K.: A rule-based behaviour planner for autonomous driving. In: IJCRR, pp. 263\u2013279 (2022)","DOI":"10.1007\/978-3-031-21541-4_17"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: CVPR, pp. 11621\u201311631 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"6_CR3","unstructured":"Chai, Y., Sapp, B., Bansal, M., Anguelov, D.: Multipath: multiple probabilistic anchor trajectory hypotheses for behavior prediction. arXiv preprint arXiv:1910.05449 (2019)"},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Y., Liu, J., Zhang, X., Qi, X., Jia, J.: Voxelnext: fully sparse voxelnet for 3D object detection and tracking. arXiv preprint arXiv:2303.11301 (2023)","DOI":"10.1109\/CVPR52729.2023.02076"},{"key":"6_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, J., Xin, R., Wang, S., Liu, M.: MPNP: multi-policy neural planner for urban driving. In: IROS, pp. 10549\u201310554 (2022)","DOI":"10.1109\/IROS47612.2022.9982111"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: EMNLP, p.\u00a01724 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"6_CR7","unstructured":"Dauner, D., Hallgarten, M., Geiger, A., Chitta, K.: Parting with misconceptions about learning-based vehicle motion planning. In: CoRL (2023)"},{"key":"6_CR8","unstructured":"Dosovitskiy, A., Ros, G., Codevilla, F., Lopez, A., Koltun, V.: Carla: an open urban driving simulator. In: CoRL (2017)"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Duan, Y., Guo, X., Zhu, Z.: Diffusiondepth: diffusion denoising approach for monocular depth estimation. arXiv preprint arXiv:2303.05021 (2023)","DOI":"10.1007\/978-3-031-73247-8_25"},{"key":"6_CR10","unstructured":"Duan, Y., Guo, X., Zhu, Z., Wang, Z., Wang, Y.K., Lin, C.T.: Maskfuser: masked fusion of joint multi-modal tokenization for end-to-end autonomous driving. arXiv preprint arXiv:2405.07573 (2024)"},{"key":"6_CR11","doi-asserted-by":"crossref","unstructured":"Graham, B., Engelcke, M., Van Der\u00a0Maaten, L.: 3D semantic segmentation with submanifold sparse convolutional networks. In: CVPR, pp. 9224\u20139232 (2018)","DOI":"10.1109\/CVPR.2018.00961"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Gu, J., et al.: ViP3D: end-to-end visual trajectory prediction via 3D agent queries. arXiv preprint arXiv:2208.01582 (2022)","DOI":"10.1109\/CVPR52729.2023.00532"},{"key":"6_CR13","unstructured":"Guo, X., et al.: Openstereo: a comprehensive benchmark for stereo matching and strong baseline. arXiv preprint arXiv:2312.00343 (2023)"},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"6_CR15","doi-asserted-by":"crossref","unstructured":"Hu, A., et al.: Fiery: future instance prediction in bird\u2019s-eye view from surround monocular cameras. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01499"},{"key":"6_CR16","doi-asserted-by":"crossref","unstructured":"Hu, P., Huang, A., Dolan, J., Held, D., Ramanan, D.: Safe local motion planning with self-supervised freespace forecasting. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01254"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Hu, S., Chen, L., Wu, P., Li, H., Yan, J., Tao, D.: ST-P3: end-to-end vision-based autonomous driving via spatial-temporal feature learning. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19839-7_31"},{"key":"6_CR18","doi-asserted-by":"crossref","unstructured":"Hu, Y., et\u00a0al.: Planning-oriented autonomous driving. In: CVPR, pp. 17853\u201317862 (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"6_CR19","unstructured":"Huang, J., Huang, G., Zhu, Z., Du, D.: BEVDet: high-performance multi-camera 3D object detection in bird-eye-view. arXiv preprint arXiv:2112.11790 (2021)"},{"key":"6_CR20","doi-asserted-by":"crossref","unstructured":"Huang, Y., Zheng, W., Zhang, B., Zhou, J., Lu, J.: SelfOcc: self-supervised vision-based 3D occupancy prediction. In: CVPR, pp. 19946\u201319956 (2024)","DOI":"10.1109\/CVPR52733.2024.01885"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Huang, Y., Zheng, W., Zhang, Y., Zhou, J., Lu, J.: Tri-perspective view for vision-based 3D semantic occupancy prediction. In: CVPR, pp. 9223\u20139232 (2023)","DOI":"10.1109\/CVPR52729.2023.00890"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Huang, Y., Zheng, W., Zhang, Y., Zhou, J., Lu, J.: GaussianFormer: scene as Gaussians for vision-based 3D semantic occupancy prediction. In: ECCV (2024)","DOI":"10.1007\/978-3-031-73383-3_22"},{"key":"6_CR23","unstructured":"Jiang, B., et al.: Perceive, interact, predict: learning dynamic and static clues for end-to-end motion prediction. arXiv preprint arXiv:2212.02181 (2022)"},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"Jiang, B., et al.: VAD: vectorized scene representation for efficient autonomous driving. arXiv preprint arXiv:2303.12077 (2023)","DOI":"10.1109\/ICCV51070.2023.00766"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Jiang, Y., et al.: PolarFormer: multi-camera 3D object detection with polar transformers. arXiv preprint arXiv:2206.15398 (2022)","DOI":"10.1609\/aaai.v37i1.25185"},{"key":"6_CR26","doi-asserted-by":"crossref","unstructured":"Khurana, T., Hu, P., Dave, A., Ziglar, J., Held, D., Ramanan, D.: Differentiable raycasting for self-supervised occupancy forecasting. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19839-7_21"},{"key":"6_CR27","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Li, Q., Wang, Y., Wang, Y., Zhao, H.: Hdmapnet: an online HD map construction and evaluation framework. In: ICRA (2022)","DOI":"10.1109\/ICRA46639.2022.9812383"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Bevdepth: acquisition of reliable depth for multi-view 3D object detection. arXiv preprint arXiv:2206.10092 (2022)","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"6_CR30","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Bevformer: learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"6_CR31","doi-asserted-by":"crossref","unstructured":"Liang, M., et al.: Learning lane graph representations for motion forecasting. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58536-5_32"},{"key":"6_CR32","unstructured":"Liang, T., et al.: BEVFusion: a simple and robust lidar-camera fusion framework. arXiv preprint arXiv:2205.13790 (2022)"},{"key":"6_CR33","unstructured":"Liao, B., et al.: MapTR: structured modeling and learning for online vectorized HD map construction. arXiv preprint arXiv:2208.14437 (2022)"},{"key":"6_CR34","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"6_CR35","unstructured":"Liu, Y., Wang, Y., Wang, Y., Zhao, H.: Vectormapnet: end-to-end vectorized HD map learning. arXiv preprint arXiv:2206.08920 (2022)"},{"key":"6_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhang, J., Fang, L., Jiang, Q., Zhou, B.: Multimodal motion prediction with stacked transformers. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00749"},{"key":"6_CR37","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wang, T., Zhang, X., Sun, J.: PETR: position embedding transformation for multi-view 3D object detection. arXiv preprint arXiv:2203.05625 (2022)","DOI":"10.1007\/978-3-031-19812-0_31"},{"key":"6_CR38","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: BEVFusion: multi-task multi-sensor fusion with unified bird\u2019s-eye view representation. arXiv preprint arXiv:2205.13542 (2022)","DOI":"10.1109\/ICRA48891.2023.10160968"},{"key":"6_CR39","unstructured":"Loshchilov, I., Hutter, F.: SGDR: stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)"},{"key":"6_CR40","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"6_CR41","doi-asserted-by":"crossref","unstructured":"Mao, J., et al.: Voxel transformer for 3D object detection. In: ICCV, pp. 3164\u20133173 (2021)","DOI":"10.1109\/ICCV48922.2021.00315"},{"key":"6_CR42","unstructured":"Ngiam, J., et\u00a0al.: Scene transformer: a unified architecture for predicting multiple agent trajectories. arXiv preprint arXiv:2106.08417 (2021)"},{"key":"6_CR43","doi-asserted-by":"crossref","unstructured":"Phan-Minh, T., Grigore, E.C., Boulton, F.A., Beijbom, O., Wolff, E.M.: Covernet: Multimodal behavior prediction using trajectory sets. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01408"},{"key":"6_CR44","doi-asserted-by":"crossref","unstructured":"Philion, J., Fidler, S.: Lift, splat, shoot: encoding images from arbitrary camera rigs by implicitly unprojecting to 3D. In: ECCV, pp. 194\u2013210 (2020)","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"6_CR45","doi-asserted-by":"crossref","unstructured":"Pini, S., Perone, C.S., Ahuja, A., Ferreira, A.S.R., Niendorf, M., Zagoruyko, S.: Safe real-world autonomous driving by learning to predict and plan with a mixture of experts. In: ICRA, pp. 10069\u201310075 (2023)","DOI":"10.1109\/ICRA48891.2023.10160992"},{"key":"6_CR46","doi-asserted-by":"crossref","unstructured":"Ratliff, N.D., Bagnell, J.A., Zinkevich, M.A.: Maximum margin planning. In: Proceedings of the 23rd International Conference on Machine Learning, pp. 729\u2013736 (2006)","DOI":"10.1145\/1143844.1143936"},{"key":"6_CR47","doi-asserted-by":"crossref","unstructured":"Reading, C., Harakeh, A., Chae, J., Waslander, S.L.: Categorical depth distribution network for monocular 3D object detection. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00845"},{"key":"6_CR48","unstructured":"Scheel, O., Bergamini, L., Wolczyk, M., Osi\u0144ski, B., Ondruska, P.: Urban driver: learning to drive from real-world demonstrations using policy gradients. In: Conference on Robot Learning, pp. 718\u2013728. PMLR (2022)"},{"key":"6_CR49","unstructured":"Tian, X., Jiang, T., Yun, L., Wang, Y., Wang, Y., Zhao, H.: OCC3D: a large-scale 3D occupancy prediction benchmark for autonomous driving. arXiv preprint arXiv:2304.14365 (2023)"},{"key":"6_CR50","doi-asserted-by":"crossref","unstructured":"Tong, W., et\u00a0al.: Scene as occupancy. In: ICCV, pp. 8406\u20138415 (2023)","DOI":"10.1109\/ICCV51070.2023.00772"},{"issue":"2","key":"6_CR51","doi-asserted-by":"publisher","first-page":"1805","DOI":"10.1103\/PhysRevE.62.1805","volume":"62","author":"M Treiber","year":"2000","unstructured":"Treiber, M., Hennecke, A., Helbing, D.: Congested traffic states in empirical observations and microscopic simulations. Phys. Rev. E 62(2), 1805 (2000)","journal-title":"Phys. Rev. E"},{"key":"6_CR52","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"6_CR53","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Openoccupancy: a large scale benchmark for surrounding semantic occupancy perception. arXiv preprint arXiv:2303.03991 (2023)","DOI":"10.1109\/ICCV51070.2023.01636"},{"key":"6_CR54","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhao, L., Zheng, W., Zhu, Z., Zhou, J., Lu, J.: SurroundOcc: multi-camera 3D occupancy prediction for autonomous driving. In: ICCV, pp. 21729\u201321740 (2023)","DOI":"10.1109\/ICCV51070.2023.01986"},{"key":"6_CR55","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Weng, X., Ou, Y., Kitani, K.M.: Agentformer: agent-aware transformers for socio-temporal multi-agent forecasting. In: ICCV, pp. 9813\u20139823 (2021)","DOI":"10.1109\/ICCV48922.2021.00967"},{"key":"6_CR56","doi-asserted-by":"crossref","unstructured":"Zeng, S., Zheng, W., Lu, J., Yan, H.: Hardness-aware scene synthesis for semi-supervised 3D object detection. TMM (2024)","DOI":"10.1109\/TMM.2024.3396297"},{"key":"6_CR57","doi-asserted-by":"crossref","unstructured":"Zeng, W., et al.: End-to-end interpretable neural motion planner. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00886"},{"key":"6_CR58","unstructured":"Zhang, Y., et al.: Beverse: unified perception and prediction in birds-eye-view for vision-centric autonomous driving. arXiv preprint arXiv:2205.09743 (2022)"},{"key":"6_CR59","doi-asserted-by":"crossref","unstructured":"Zhao, L., et al.: Lowrankocc: tensor decomposition and low-rank recovery for vision-based 3D semantic occupancy prediction. In: CVPR, pp. 9806\u20139815 (2024)","DOI":"10.1109\/CVPR52733.2024.00936"},{"key":"6_CR60","doi-asserted-by":"crossref","unstructured":"Zheng, W., Chen, W., Huang, Y., Zhang, B., Duan, Y., Lu, J.: Occworld: learning a 3D occupancy world model for autonomous driving. In: ECCV (2024)","DOI":"10.1007\/978-3-031-72624-8_4"},{"key":"6_CR61","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Tuzel, O.: Voxelnet: end-to-end learning for point cloud based 3D object detection. In: CVPR, pp. 4490\u20134499 (2018)","DOI":"10.1109\/CVPR.2018.00472"},{"key":"6_CR62","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"},{"key":"6_CR63","unstructured":"Zuo, S., Zheng, W., Huang, Y., Zhou, J., Lu, J.: Pointocc: cylindrical tri-perspective view for point-based 3D semantic occupancy prediction. arXiv preprint arXiv:2308.16896 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73650-6_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T19:03:06Z","timestamp":1732129386000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73650-6_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"ISBN":["9783031736490","9783031736506"],"references-count":63,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73650-6_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"21 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}