{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,27]],"date-time":"2026-06-27T16:03:27Z","timestamp":1782576207935,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755341","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:15Z","timestamp":1761375255000},"page":"4193-4202","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["NavigScene: Bridging Local Perception and Global Navigation for Beyond-Visual-Range Autonomous Driving"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0311-6135","authenticated-orcid":false,"given":"Qucheng","family":"Peng","sequence":"first","affiliation":[{"name":"Center for Research in Computer Vision, University of Central Florida, Orlando, Florida, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0194-1358","authenticated-orcid":false,"given":"Chen","family":"Bai","sequence":"additional","affiliation":[{"name":"Xpeng Motors, Santa Clara, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0612-3823","authenticated-orcid":false,"given":"Guoxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xpeng Motors, Santa Clara, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6379-7617","authenticated-orcid":false,"given":"Bo","family":"Xu","sequence":"additional","affiliation":[{"name":"Xpeng Motors, Santa Clara, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8327-1584","authenticated-orcid":false,"given":"Xiaotong","family":"Liu","sequence":"additional","affiliation":[{"name":"Xpeng Motors, Santa Clara, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9640-0563","authenticated-orcid":false,"given":"Xiaoyin","family":"Zheng","sequence":"additional","affiliation":[{"name":"Xpeng Motors, Santa Clara, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3957-7061","authenticated-orcid":false,"given":"Chen","family":"Chen","sequence":"additional","affiliation":[{"name":"Center for Research in Computer Vision, University of Central Florida, Orlando, Florida, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5905-2321","authenticated-orcid":false,"given":"Cheng","family":"Lu","sequence":"additional","affiliation":[{"name":"Xpeng Motors, Santa Clara, California, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","volume-title":"Spice: Semantic propositional image caption evaluation. In Computer Vision-ECCV 2016: 14th European Conference","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer, 382-398."},{"key":"e_1_3_2_2_3_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65-72."},{"key":"e_1_3_2_2_4_1","volume-title":"Proceedings of the 31st International Conference on Computational Linguistics. 9074-9084","author":"Bi Jing","year":"2025","unstructured":"Jing Bi, Yuting Wu, Weiwei Xing, and Zhenjie Wei. 2025. Enhancing the Reasoning Capabilities of Small Language Models via Solution Guidance Fine-Tuning. In Proceedings of the 31st International Conference on Computational Linguistics. 9074-9084."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"e_1_3_2_2_6_1","volume-title":"End-to-end autonomous driving: Challenges and frontiers","author":"Chen Li","year":"2024","unstructured":"Li Chen, Penghao Wu, Kashyap Chitta, Bernhard Jaeger, Andreas Geiger, and Hongyang Li. 2024b. End-to-end autonomous driving: Challenges and frontiers. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_2_7_1","volume-title":"Vadv2: End-to-end vectorized autonomous driving via probabilistic planning. arXiv preprint arXiv:2402.13243","author":"Chen Shaoyu","year":"2024","unstructured":"Shaoyu Chen, Bo Jiang, Hao Gao, Bencheng Liao, Qing Xu, Qian Zhang, Chang Huang, Wenyu Liu, and Xinggang Wang. 2024a. Vadv2: End-to-end vectorized autonomous driving via probabilistic planning. arXiv preprint arXiv:2402.13243 (2024)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2023.3318070"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3573900.3593631"},{"key":"e_1_3_2_2_10_1","first-page":"28706","article-title":"Navsim: Data-driven non-reactive autonomous vehicle simulation and benchmarking","volume":"37","author":"Dauner Daniel","year":"2024","unstructured":"Daniel Dauner, Marcel Hallgarten, Tianyu Li, Xinshuo Weng, Zhiyu Huang, Zetong Yang, Hongyang Li, Igor Gilitschenski, Boris Ivanovic, Marco Pavone, et al., 2024. Navsim: Data-driven non-reactive autonomous vehicle simulation and benchmarking. Advances in Neural Information Processing Systems, Vol. 37 (2024), 28706-28719.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_11_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13668-13677","author":"Ding Xinpeng","year":"2024","unstructured":"Xinpeng Ding, Jianhua Han, Hang Xu, Xiaodan Liang, Wei Zhang, and Xiaomeng Li. 2024. Holistic autonomous driving understanding by bird's-eye-view injected multi-modal large models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13668-13677."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.12"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680583"},{"key":"e_1_3_2_2_14_1","volume-title":"International Joint Conference on Artificial Intelligence. Springer, 34-49","author":"He Yangfan","year":"2024","unstructured":"Yangfan He, Xinyan Wang, and Tianyu Shi. 2024a. Ddpm-moco: Advancing industrial surface defect generation and detection with generative and contrastive learning. In International Joint Conference on Artificial Intelligence. Springer, 34-49."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"e_1_3_2_2_16_1","volume-title":"The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Jia Xiaosong","unstructured":"Xiaosong Jia, Zhenjie Yang, Qifeng Li, Zhiyuan Zhang, and Junchi Yan. [n.d.]. Bench2Drive: Towards Multi-Ability Benchmarking of Closed-Loop End-To-End Autonomous Driving. In The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00766"},{"key":"e_1_3_2_2_18_1","volume-title":"Multi-Modal Large Language Model with RAG Strategies in Soccer Commentary Generation. In 2025 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). IEEE, 6197-6206","author":"Li Xiang","year":"2025","unstructured":"Xiang Li, Yangfan He, Shuaishuai Zu, Zhengyang Li, Tianyu Shi, Yiting Xie, and Kevin Zhang. 2025. Multi-Modal Large Language Model with RAG Strategies in Soccer Commentary Generation. In 2025 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). IEEE, 6197-6206."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01124"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680572"},{"key":"e_1_3_2_2_21_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81."},{"key":"e_1_3_2_2_22_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_2_23_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72980-5_15"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2011.6094584"},{"key":"e_1_3_2_2_26_1","volume-title":"European Conference on Computer Vision. Springer, 292-308","author":"Nie Ming","year":"2024","unstructured":"Ming Nie, Renyuan Peng, Chunwei Wang, Xinyue Cai, Jianhua Han, Hang Xu, and Li Zhang. 2024. Reason2drive: Towards interpretable and chain-based reasoning for autonomous driving. In European Conference on Computer Vision. Springer, 292-308."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01398"},{"key":"e_1_3_2_2_28_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_2_29_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Peng Qucheng","year":"2025","unstructured":"Qucheng Peng, Benjamin Planche, Zhongpai Gao, Meng Zheng, Anwesa Choudhuri, Terrence Chen, Chen Chen, and Ziyan Wu. 2025a. 3D Vision-Language Gaussian Splatting. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00218"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG61629.2025.11099339"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"e_1_3_2_2_33_1","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov Rafael","year":"2023","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. 2023. Direct preference optimization: Your language model is secretly a reward model. Advances in Neural Information Processing Systems, Vol. 36 (2023), 53728-53741.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681483"},{"key":"e_1_3_2_2_35_1","volume-title":"European Conference on Computer Vision. Springer, 256-274","author":"Sima Chonghao","year":"2024","unstructured":"Chonghao Sima, Katrin Renz, Kashyap Chitta, Li Chen, Hanxue Zhang, Chengen Xie, Jens Bei\u00dfwenger, Ping Luo, Andreas Geiger, and Hongyang Li. 2024. Drivelm: Driving with graph visual question answering. In European Conference on Computer Vision. Springer, 256-274."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02089"},{"key":"e_1_3_2_2_37_1","unstructured":"Gilbert Strang and Kai Borre. 1997. Linear algebra geodesy and GPS. Siam."},{"key":"e_1_3_2_2_38_1","volume-title":"Sparsedrive: End-to-end autonomous driving via sparse scene representation. arXiv preprint arXiv:2405.19620","author":"Sun Wenchao","year":"2024","unstructured":"Wenchao Sun, Xuewu Lin, Yining Shi, Chuang Zhang, Haoran Wu, and Sifa Zheng. 2024. Sparsedrive: End-to-end autonomous driving via sparse scene representation. arXiv preprint arXiv:2405.19620 (2024)."},{"key":"e_1_3_2_2_39_1","volume-title":"Beginning google maps API 3","author":"Svennerberg Gabriel","unstructured":"Gabriel Svennerberg. 2010. Beginning google maps API 3. Apress."},{"key":"e_1_3_2_2_40_1","first-page":"1","article-title":"Deep learning and the information bottleneck principle. In 2015 ieee information theory workshop (itw)","author":"Tishby Naftali","year":"2015","unstructured":"Naftali Tishby and Noga Zaslavsky. 2015. Deep learning and the information bottleneck principle. In 2015 ieee information theory workshop (itw). Ieee, 1-5.","journal-title":"Ieee"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1389"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i14.29541"},{"key":"e_1_3_2_2_44_1","first-page":"80522","article-title":"V-petl bench: A unified visual parameter-efficient transfer learning benchmark","volume":"37","author":"Xin Yi","year":"2024","unstructured":"Yi Xin, Siqi Luo, Xuyang Liu, Haodi Zhou, Xinyu Cheng, Christina E Lee, Junlong Du, Haozhe Wang, MingCai Chen, Ting Liu, et al., 2024b. V-petl bench: A unified visual parameter-efficient transfer learning benchmark. Advances in Neural Information Processing Systems, Vol. 37 (2024), 80522-80535.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_45_1","volume-title":"Parameter-efficient fine-tuning for pre-trained vision models: A survey. arXiv preprint arXiv:2402.02242","author":"Xin Yi","year":"2024","unstructured":"Yi Xin, Siqi Luo, Haodi Zhou, Junlong Du, Xiaohong Liu, Yue Fan, Qing Li, and Yuntao Du. 2024c. Parameter-efficient fine-tuning for pre-trained vision models: A survey. arXiv preprint arXiv:2402.02242 (2024)."},{"key":"e_1_3_2_2_46_1","volume-title":"Yupeng Zhou, Renrui Zhang, Le Zhuo, et al.","author":"Xin Yi","year":"2025","unstructured":"Yi Xin, Juncheng Yan, Qi Qin, Zhen Li, Dongyang Liu, Shicheng Li, Victor Shea-Jay Huang, Yupeng Zhou, Renrui Zhang, Le Zhuo, et al., 2025a. Lumina-mGPT 2.0: Stand-Alone AutoRegressive Image Modeling. arXiv preprint arXiv:2507.17801 (2025)."},{"key":"e_1_3_2_2_47_1","unstructured":"Yi Xin Le Zhuo Qi Qin Siqi Luo Yuewen Cao Bin Fu Yangfan He Hongsheng Li Guangtao Zhai Xiaohong Liu et al. 2025b. Resurrect Mask AutoRegressive Modeling for Efficient and Scalable Image Generation. arXiv preprint arXiv:2507.13032 (2025)."},{"key":"e_1_3_2_2_48_1","volume-title":"Drivegpt4: Interpretable end-to-end autonomous driving via large language model","author":"Xu Zhenhua","year":"2024","unstructured":"Zhenhua Xu, Yujia Zhang, Enze Xie, Zhen Zhao, Yong Guo, Kwan-Yee K Wong, Zhenguo Li, and Hengshuang Zhao. 2024. Drivegpt4: Interpretable end-to-end autonomous driving via large language model. IEEE Robotics and Automation Letters (2024)."},{"key":"e_1_3_2_2_49_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024b. Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_2_50_1","volume-title":"Dong Chen, Jianhui Wang, Tianyu Shi, Arsalan Heydarian, and Pei Liu.","author":"Yang Chen","year":"2024","unstructured":"Chen Yang, Yangfan He, Aaron Xuxiang Tian, Dong Chen, Jianhui Wang, Tianyu Shi, Arsalan Heydarian, and Pei Liu. 2024a. Wcdt: World-centric diffusion transformer for traffic scene generation. arXiv preprint arXiv:2404.02082 (2024)."},{"key":"e_1_3_2_2_51_1","volume-title":"LLaMA-Adapter: Efficient Fine-tuning of Large Language Models with Zero-initialized Attention. In The Twelfth International Conference on Learning Representations.","author":"Zhang Renrui","year":"2024","unstructured":"Renrui Zhang, Jiaming Han, Chris Liu, Aojun Zhou, Pan Lu, Yu Qiao, Hongsheng Li, and Peng Gao. 2024. LLaMA-Adapter: Efficient Fine-tuning of Large Language Models with Zero-initialized Attention. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_52_1","volume-title":"European Conference on Computer Vision. Springer, 87-104","author":"Zheng Wenzhao","year":"2024","unstructured":"Wenzhao Zheng, Ruiqi Song, Xianda Guo, Chenming Zhang, and Long Chen. 2024. Genad: Generative end-to-end autonomous driving. In European Conference on Computer Vision. Springer, 87-104."},{"key":"e_1_3_2_2_53_1","volume-title":"ReAgent-V: A Reward-Driven Multi-Agent Framework for Video Understanding. arXiv preprint arXiv:2506.01300","author":"Zhou Yiyang","year":"2025","unstructured":"Yiyang Zhou, Yangfan He, Yaofeng Su, Siwei Han, Joel Jang, Gedas Bertasius, Mohit Bansal, and Huaxiu Yao. 2025. ReAgent-V: A Reward-Driven Multi-Agent Framework for Video Understanding. arXiv preprint arXiv:2506.01300 (2025)."},{"key":"e_1_3_2_2_54_1","volume-title":"Human-centric Reward Optimization for Reinforcement Learning-based Automated Driving using Large Language Models. arXiv preprint arXiv:2405.04135","author":"Zhou Ziqi","year":"2024","unstructured":"Ziqi Zhou, Jingyue Zhang, Jingyuan Zhang, Yangfan He, Boyue Wang, Tianyu Shi, and Alaa Khamis. 2024. Human-centric Reward Optimization for Reinforcement Learning-based Automated Driving using Large Language Models. arXiv preprint arXiv:2405.04135 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755341","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:01:22Z","timestamp":1765339282000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755341"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":54,"alternative-id":["10.1145\/3746027.3755341","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755341","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}