{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T03:53:04Z","timestamp":1781495584299,"version":"3.54.1"},"reference-count":57,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Robot. Autom. Lett."],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1109\/lra.2025.3611145","type":"journal-article","created":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T17:32:39Z","timestamp":1758130359000},"page":"11713-11720","source":"Crossref","is-referenced-by-count":2,"title":["Efficient Multi-Camera Tokenization With Triplanes for End-to-End Driving"],"prefix":"10.1109","volume":"10","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8698-202X","authenticated-orcid":false,"given":"Boris","family":"Ivanovic","sequence":"first","affiliation":[{"name":"NVIDIA Research, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Cristiano","family":"Saltori","sequence":"additional","affiliation":[{"name":"NVIDIA Research, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6898-9463","authenticated-orcid":false,"given":"Yurong","family":"You","sequence":"additional","affiliation":[{"name":"NVIDIA Research, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yan","family":"Wang","sequence":"additional","affiliation":[{"name":"NVIDIA Research, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wenjie","family":"Luo","sequence":"additional","affiliation":[{"name":"NVIDIA Research, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0206-4337","authenticated-orcid":false,"given":"Marco","family":"Pavone","sequence":"additional","affiliation":[{"name":"NVIDIA Research, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1117\/12.3054764"},{"key":"ref3","first-page":"1556","article-title":"A survey on model compression for large language models","volume-title":"Trans. Assoc. Comput. Linguistics","volume":"12","author":"Zhu","year":"2024"},{"key":"ref4","first-page":"3483","article-title":"Learning structured output representation using deep conditional generative models","volume-title":"Proc. Conf. Neural Inf. Process. Syst.","author":"Sohn","year":"2015"},{"key":"ref5","first-page":"6309","article-title":"Neural discrete representation learning","volume-title":"Proc. Conf. Neural Inf. Process. Syst.","author":"Oord","year":"2017"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref8","article-title":"Scaling laws in patchification: An image is worth 50,176 tokens and more","author":"Wang","year":"2025"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00766"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01463"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref13","article-title":"A generalist agent","volume-title":"Trans. Mach. Learn. Res.","author":"Reed","year":"2022"},{"key":"ref14","article-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023"},{"key":"ref15","article-title":"Open X-embodiment: Robotic learning datasets and RT-X models","year":"2023"},{"key":"ref16","first-page":"14975","article-title":"VIMA: General robot manipulation with multimodal prompts","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jiang","year":"2023"},{"key":"ref17","article-title":"GAIA-1: A generative world model for autonomous driving","author":"Hu","year":"2023"},{"key":"ref18","article-title":"LINGO-1: Exploring natural language for autonomous driving","year":"2023"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3440097"},{"key":"ref20","article-title":"DriveVLM: The convergence of autonomous driving and large vision-language models","volume-title":"Proc. Conf. Robot Learn.","author":"Tian","year":"2024"},{"key":"ref21","article-title":"OmniDrive: A holistic llm-agent framework for autonomous driving with 3D perception, reasoning and planning","author":"Wang","year":"2024"},{"key":"ref22","article-title":"EMMA: End-to-end multimodal model for autonomous driving","author":"Hwang","year":"2024"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref24","article-title":"Gemini: A family of highly capable multimodal models","year":"2023"},{"key":"ref25","article-title":"MobileVLM: A fast, reproducible and strong vision language assistant for mobile devices","author":"Chu","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01420"},{"key":"ref27","first-page":"128940","article-title":"An image is worth 32 tokens for reconstruction and generation","volume-title":"Proc. Conf. Neural Inf. Process. Syst.","author":"Yu","year":"2024"},{"key":"ref28","article-title":"Spectral image tokenizer","author":"Esteves","year":"2024"},{"key":"ref29","article-title":"How many tokens is an image worth?","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Duggal","year":"2025"},{"key":"ref30","article-title":"ElasticTok: Adaptive tokenization for image and video","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yan","year":"2025"},{"issue":"4","key":"ref31","article-title":"3D Gaussian splatting for real-time radiance field rendering","volume-title":"Proc. SIGGRAPH","volume":"42","author":"Kerbl","year":"2023"},{"key":"ref32","article-title":"DistillNeRF: Perceiving 3D scenes from single-glance images by distilling neural fields and foundation model features","volume-title":"Proc. Conf. Neural Inf. Process. Syst.","author":"Wang","year":"2024"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00025"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01120"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01201"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00021"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02000"},{"key":"ref40","article-title":"LRM: Large reconstruction model for single image to 3D","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hong","year":"2024"},{"key":"ref41","first-page":"1","article-title":"Frankenstein: Generating semantic-compositional 3D scenes in one tri-plane","volume-title":"Proc. SIGGRAPH Asia","author":"Han","year":"2024"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00890"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01885"},{"key":"ref44","article-title":"A survey on occupancy perception for autonomous driving: The information fusion perspective","volume-title":"Inf. Fusion","volume":"114","author":"Xu","year":"2025"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00539"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"ref48","article-title":"Vector-quantized image modeling with improved VQGAN","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yu","year":"2022"},{"key":"ref49","article-title":"Finite scalar quantization: VQ-VAE made simple","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Mentzer","year":"2024"},{"key":"ref50","article-title":"DINOv2: Learning robust visual features without supervision","volume-title":"Trans. Mach. Learn. Res.","author":"Oquab","year":"2024"},{"key":"ref51","article-title":"Accelerate the future of AI-defined vehicles and autonomous driving","author":"Wu","year":"2025"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref53","article-title":"STORM: Spatio-temporal reconstruction model for large-scale outdoor scenes","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yang","year":"2025"},{"key":"ref54","article-title":"Tokenize the world into object-level knowledge to address long-tail events in autonomous driving","volume-title":"Proc. Conf. Robot Learn.","author":"Tian","year":"2024"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02568"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref57","first-page":"28706","article-title":"NAVSIM: Data-driven non-reactive autonomous vehicle simulation and benchmarking","volume-title":"Proc. Conf. Neural Inf. Process. Syst.","author":"Dauner","year":"2024"}],"container-title":["IEEE Robotics and Automation Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/7083369\/11169302\/11168172.pdf?arnumber=11168172","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T05:38:02Z","timestamp":1760161082000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11168172\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11]]},"references-count":57,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/lra.2025.3611145","relation":{},"ISSN":["2377-3766","2377-3774"],"issn-type":[{"value":"2377-3766","type":"electronic"},{"value":"2377-3774","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11]]}}}