{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:50:13Z","timestamp":1765309813883,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China","award":["No. 62332016"],"award-info":[{"award-number":["No. 62332016"]}]},{"name":"National Key R&D Program of China","award":["No.2023YFB4704500"],"award-info":[{"award-number":["No.2023YFB4704500"]}]},{"name":"Guangdong Province R&D Program","award":["2020B0909050001"],"award-info":[{"award-number":["2020B0909050001"]}]},{"name":"Anhui Province Development and Reform Commission 2021 New Energy and Intelligent Connected Vehicle Innovation Project"},{"name":"the Youth Innovation Promotion Association of the Chinese Academy of Sciences"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755767","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:17Z","timestamp":1761375257000},"page":"2178-2187","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CalibWorkflow: A General MLLM-Guided Workflow for Centimeter-Level Cross-Sensor Calibration"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-0161-9539","authenticated-orcid":false,"given":"Xingchen","family":"Li","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1882-2467","authenticated-orcid":false,"given":"Wuyang","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0964-7279","authenticated-orcid":false,"given":"Guoliang","family":"You","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7164-7245","authenticated-orcid":false,"given":"Xiaomeng","family":"Chu","sequence":"additional","affiliation":[{"name":"Yale University, New Haven, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7319-409X","authenticated-orcid":false,"given":"Wenhao","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0754-3953","authenticated-orcid":false,"given":"Yifan","family":"Duan","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7896-6632","authenticated-orcid":false,"given":"Yuxuan","family":"Xiao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6520-255X","authenticated-orcid":false,"given":"Yanyong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. 
arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_3_1","unstructured":"Anthropic AI. 2024. Model Card Addendum: Claude 3.5 Haiku and Upgraded Claude 3.5 Sonnet. https:\/\/www.anthropic.com Accessed: 2025-03-15."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00822"},{"key":"e_1_3_2_1_5_1","volume-title":"Mustafa Hajijc, Mina Sartipib, and Siyang Cao.","author":"Chenga Lei","year":"2025","unstructured":"Lei Chenga, Lihao Guoa, Tianya Zhangb, Tam Bangb, Austin Harrisb, Mustafa Hajijc, Mina Sartipib, and Siyang Cao. 2025. CalibRefine: Deep Learning-Based Online Automatic Targetless LiDAR-Camera Calibration with Iterative and Attention-Driven Post-Refinement. arXiv preprint arXiv:2502.17648 (2025)."},{"key":"e_1_3_2_1_6_1","unstructured":"Yifan Duan Xinran Zhang Guoliang You Yilong Wu Xingchen Li Yao Li Xiaomeng Chu Jie Peng Yu Zhang Jianmin Ji et al. 2024. Rotation Initialization and Stepwise Refinement for Universal LiDAR Calibration. arXiv preprint arXiv:2405.05589 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00102"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIM.2022.3204338"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913491297"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2012.6224570"},{"key":"e_1_3_2_1_11_1","volume-title":"target-free lidar-camera extrinsic calibration via cross-modal mask matching","author":"Huang Zhiwei","year":"2024","unstructured":"Zhiwei Huang, Yikang Zhang, Qijun Chen, and Rui Fan. 2024. 
Online, target-free lidar-camera extrinsic calibration via cross-modal mask matching. IEEE Transactions on Intelligent Vehicles (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2018.8593360"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/MFI52462.2021.9591203"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_15_1","volume-title":"single-shot, target-less, and automatic LiDAR-camera extrinsic calibration toolbox. arXiv preprint arXiv:2302.05094","author":"Koide Kenji","year":"2023","unstructured":"Kenji Koide, Shuji Oishi, Masashi Yokozuka, and Atsuhiko Banno. 2023. General, single-shot, target-less, and automatic LiDAR-camera extrinsic calibration toolbox. arXiv preprint arXiv:2302.05094 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICTC62082.2024.10826946"},{"volume-title":"Robotics: science and systems","author":"Levinson Jesse","key":"e_1_3_2_1_17_1","unstructured":"Jesse Levinson and Sebastian Thrun. 2013. Automatic Online Calibration of Cameras and Lasers.. In Robotics: science and systems, Vol. 2. Citeseer."},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_19_1","volume-title":"Edgecalib: Multi-frame weighted edge features for automatic targetless lidar-camera calibration","author":"Li Xingchen","year":"2024","unstructured":"Xingchen Li, Yifan Duan, Beibei Wang, Haojie Ren, Guoliang You, Yu Sheng, Jianmin Ji, and Yanyong Zhang. 2024. Edgecalib: Multi-frame weighted edge features for automatic targetless lidar-camera calibration. 
IEEE Robotics and Automation Letters (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.21203\/rs.3.rs-2018540\/v1"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3193465"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/RCAR47638.2019.9044146"},{"key":"e_1_3_2_1_23_1","volume-title":"SE-Calib: Semantic Edges based LiDAR-Camera Boresight Online Calibration in Urban Scenes","author":"Liao Youqi","year":"2023","unstructured":"Youqi Liao, Jianping Li, Shuhao Kang, Qiang Li, Guifang Zhu, Shenghai Yuan, Zhen Dong, and Bisheng Yang. 2023. SE-Calib: Semantic Edges based LiDAR-Camera Boresight Online Calibration in Urban Scenes. IEEE Transactions on Geoscience and Remote Sensing (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"CLMASP: Coupling Large Language Models with Answer Set Programming for Robotic Task Planning. arXiv preprint arXiv:2406.03367","author":"Lin Xinrui","year":"2024","unstructured":"Xinrui Lin, Yangfan Wu, Huanyu Yang, Yu Zhang, Yanyong Zhang, and Jianmin Ji. 2024. CLMASP: Coupling Large Language Models with Answer Set Programming for Robotic Task Planning. arXiv preprint arXiv:2406.03367 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw53098.2021.00324"},{"key":"e_1_3_2_1_27_1","volume-title":"Gpt-driver: Learning to drive with gpt. arXiv preprint arXiv:2310.01415","author":"Mao Jiageng","year":"2023","unstructured":"Jiageng Mao, Yuxi Qian, Junjie Ye, Hang Zhao, and Yue Wang. 2023a. Gpt-driver: Learning to drive with gpt. 
arXiv preprint arXiv:2310.01415 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"A language agent for autonomous driving. arXiv preprint arXiv:2311.10813","author":"Mao Jiageng","year":"2023","unstructured":"Jiageng Mao, Junjie Ye, Yuxi Qian, Marco Pavone, and Yue Wang. 2023b. A language agent for autonomous driving. arXiv preprint arXiv:2311.10813 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"CalibOnline: Online Detection and Calibration of Extrinsic Parameters between LiDAR and Monocular Camera","author":"Pan Feng","year":"2025","unstructured":"Feng Pan and Wei Wang. 2025. CalibOnline: Online Detection and Calibration of Extrinsic Parameters between LiDAR and Monocular Camera. IEEE Sensors Journal (2025)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v26i1.8379"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2969164"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01543"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/IVS.2017.7995968"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2937909"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196831"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10609987"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.3390\/s24123878"},{"key":"e_1_3_2_1_38_1","volume-title":"Australian Conference on Robotics and Automation. 3-5.","author":"Taylor Zachary","year":"2012","unstructured":"Zachary Taylor and Juan Nieto. 2012. A mutual information approach to automatic calibration of camera and lidar in natural environments. In Australian Conference on Robotics and Automation. 
3-5."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/icra.2015.7139872"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2016.2596771"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2016.2596771"},{"key":"e_1_3_2_1_42_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception Tasks. arXiv preprint arXiv:2404.03191","author":"Wang Beibei","year":"2024","unstructured":"Beibei Wang, Shuang Meng, Lu Zhang, Chenjie Wang, Jingjing Huang, Yao Li, Haojie Ren, Yuxuan Xiao, Yuru Peng, Jianmin Ji, et al., 2024. CORP: A Multi-Modal Dataset for Campus-Oriented Roadside Perception Tasks. arXiv preprint arXiv:2404.03191 (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10801605"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610018"},{"key":"e_1_3_2_1_46_1","volume-title":"Large multimodal agents: A survey. arXiv preprint arXiv:2402.15116","author":"Xie Junlin","year":"2024","unstructured":"Junlin Xie, Zhihong Chen, Ruifei Zhang, Xiang Wan, and Guanbin Li. 2024. Large multimodal agents: A survey. arXiv preprint arXiv:2402.15116 (2024)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00033"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02482"},{"key":"e_1_3_2_1_50_1","volume-title":"Doraemongpt: Toward understanding dynamic scenes with large language models (exemplified as a video agent). 
arXiv preprint arXiv:2401.08392","author":"Yang Zongxin","year":"2024","unstructured":"Zongxin Yang, Guikun Chen, Xiaodi Li, Wenguan Wang, and Yi Yang. 2024a. Doraemongpt: Toward understanding dynamic scenes with large language models (exemplified as a video agent). arXiv preprint arXiv:2401.08392 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023. The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421, Vol. 9, 1 (2023), 1."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/tim.2020.2999137"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2021.3098923"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/iros.2004.1389752"},{"key":"e_1_3_2_1_55_1","volume-title":"European Conference on Computer Vision. Springer, 186-202","author":"Zhang Sha","year":"2024","unstructured":"Sha Zhang, Di Huang, Jiajun Deng, Shixiang Tang, Wanli Ouyang, Tong He, and Yanyong Zhang. 2024. Agent3d-zero: An agent for zero-shot 3d understanding. In European Conference on Computer Vision. 
Springer, 186-202."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561216"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2018.8593660"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755767","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:20Z","timestamp":1765309520000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755767"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":57,"alternative-id":["10.1145\/3746027.3755767","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755767","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}