{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T02:08:35Z","timestamp":1781921315001,"version":"3.54.5"},"reference-count":72,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"publisher","award":["62225603,62441615,623B2038"],"award-info":[{"award-number":["62225603,62441615,623B2038"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02582","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"27817-27827","source":"Crossref","is-referenced-by-count":4,"title":["HERMES: A Unified Self-Driving World Model for Simultaneous 3D Scene Understanding and Generation"],"prefix":"10.1109","author":[{"given":"Xin","family":"Zhou","sequence":"first","affiliation":[{"name":"Huazhong University of Science and Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dingkang","family":"Liang","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sifan","family":"Tu","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiwu","family":"Chen","sequence":"additional","affiliation":[{"name":"Mach Drive"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yikang","family":"Ding","sequence":"additional","affiliation":[{"name":"MEGVII Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dingyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Feiyang","family":"Tan","sequence":"additional","affiliation":[{"name":"Mach Drive"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hengshuang","family":"Zhao","sequence":"additional","affiliation":[{"name":"The University of Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiang","family":"Bai","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","first-page":"3","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01373"},{"key":"ref3","first-page":"6","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","volume-title":"Proc. Annual Meeting of the Association for Computational Linguistics Workshop","author":"Banerjee","year":"2005"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01297"},{"key":"ref9","first-page":"3","article-title":"Dreamllm: Synergistic multimodal comprehension and creation","volume-title":"Proc. of Intl. Conf. on Learning Representations","author":"Dong","year":"2024"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2024.105171"},{"key":"ref11","first-page":"2","article-title":"Magicdrive: Street view generation with diverse 3d geometry control","volume-title":"Proc. of Intl. Conf. on Learning Representations","author":"Gao","year":"2024"},{"key":"ref12","first-page":"2","article-title":"Vista: A generalizable driving world model with high fidelity and versatile controllability","volume-title":"Proc. of Advances in Neural Information Processing Systems","volume":"1","author":"Gao","year":"2024"},{"key":"ref13","first-page":"3","article-title":"Dome: Taming diffusion model into high-fidelity controllable occupancy world model","author":"Gu","year":"2024","journal-title":"arXiv"},{"key":"ref14","first-page":"2","article-title":"World models","volume-title":"Proc. of Advances in Neural Information Processing Systems","author":"Ha","year":"2018"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.52202\/075280-3311"},{"key":"ref16","first-page":"2","volume":"1","author":"Hu","year":"2023","journal-title":"Gaia-1: A generative world model for autonomous driving."},{"key":"ref17","first-page":"6","article-title":"Drivemm: All-in-one large multimodal model for autonomous driving","author":"Huang","year":"2024","journal-title":"arXiv"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_12"},{"key":"ref19","first-page":"6","article-title":"Gpt-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv"},{"key":"ref20","first-page":"2","article-title":"Adriver-i: A general world model for autonomous driving","author":"Jia","year":"2023","journal-title":"arXiv"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00114"},{"key":"ref22","first-page":"6","article-title":"Llava-onevision: Easy visual task transfer","author":"Li","year":"2024","journal-title":"arXiv"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160489"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73229-4_27"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3594749"},{"key":"ref28","first-page":"6","article-title":"Rouge: A package for automatic evaluation of summaries","volume-title":"Proc. Annual Meeting of the Association for Computational Linguistics Workshop","author":"Lin","year":"2004"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72989-8_19"},{"key":"ref33","first-page":"2","article-title":"Unleashing generalization of end-to-end autonomous driving with controllable long video generation","author":"Ma","year":"2024","journal-title":"arXiv"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02030"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01470"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_17"},{"key":"ref37","first-page":"6","article-title":"Time will tell: New outlooks and a baseline for temporal multi-view 3d object detection","volume-title":"Proc. of Intl. Conf. on Learning Representations","author":"Park","year":"2022"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"ref39","first-page":"7","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. of Intl. Conf. on Machine Learning","author":"Radford","year":"2021"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01432"},{"key":"ref41","first-page":"3","article-title":"Drivelm: Driving with graph visual question answering","volume-title":"Proc. of European Conference on Computer Vision","volume":"2","author":"Sima"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref44","first-page":"3","article-title":"Occsora: 4d occupancy generation models as world simulators for autonomous driving","author":"Wang","year":"2024","journal-title":"arXiv"},{"key":"ref45","first-page":"4","article-title":"Neus: Learning neural implicit surfaces by volume rendering for multi-view reconstruction","volume-title":"Proc. of Advances in Neural Information Processing Systems","volume":"3","author":"Wang","year":"2021"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00335"},{"issue":"3","key":"ref47","first-page":"5","article-title":"Omnidrive: A holistic 11 m -agent framework for autonomous driving with 3d perception, reasoning and planning","volume-title":"Proc. of IEEE Intl. Conf. on Computer Vision and Pattern Recognition","volume":"2","author":"Wang","year":"2025"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00107"},{"key":"ref49","first-page":"3","article-title":"Drivemlm: Aligning multi-modal large language models with behavioral planning states for autonomous driving","author":"Wang","year":"2023","journal-title":"arXiv"},{"key":"ref50","first-page":"2","article-title":"Drivedreamer: Towards real-worlddriven world models for autonomous driving","volume-title":"Proc. of European Conference on Computer Vision","author":"Wang"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01397"},{"key":"ref52","first-page":"3","article-title":"Occllama: An occupancy-language-action generative world model for autonomous driving","author":"Wei","year":"2024","journal-title":"arXiv"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00659"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_32"},{"key":"ref55","first-page":"3","article-title":"Visionllm v2: An end-to-end generalist multimodal large language model for hundreds of visionlanguage tasks","volume-title":"Proc. of Advances in Neural Information Processing Systems","author":"Wu","year":"2024"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3440097"},{"key":"ref57","first-page":"3","article-title":"Renderworld: World model with self-supervised 3d label","author":"Yan","year":"2024","journal-title":"arXiv"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01443"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01389"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01390"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_4"},{"key":"ref63","first-page":"3","article-title":"Learning unsupervised world models for autonomous driving via discrete diffusion","volume-title":"Proc. of Intl. Conf. on Learning Representations","volume":"1","author":"Zhang","year":"2023"},{"key":"ref64","first-page":"2","article-title":"Bevworld: A multimodal world model for autonomous driving via unified bev latent space","author":"Zhang","year":"2024","journal-title":"arXiv"},{"key":"ref65","first-page":"3","article-title":"Psalm: Pixelwise segmentation with large multi-modal model","volume-title":"Proc. of European Conference on Computer Vision","author":"Zhang"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i10.33130"},{"key":"ref67","first-page":"5","article-title":"Extending large vision-language model for diverse interactive tasks in autonomous driving","author":"Zhao","year":"2025","journal-title":"arXiv"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72624-8_4"},{"key":"ref69","first-page":"2","article-title":"Doe-1: Closed-loop autonomous driving with large world model","author":"Zheng","year":"2024","journal-title":"arXiv"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_8"},{"key":"ref71","first-page":"4","volume":"3","author":"Zhu","year":"2023","journal-title":"Ponderv2: Pave the way for 3d foundation model with a universal pre-training paradigm."},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/icra55743.2025.11128001"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11446328.pdf?arnumber=11446328","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:21:07Z","timestamp":1777612867000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11446328\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":72,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02582","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}