{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:55:29Z","timestamp":1781538929483,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"DELPHI project","award":["101104263"],"award-info":[{"award-number":["101104263"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810832","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1025-1034","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Text4Radar-V2X: Text-guided 4D Radar for Cooperative 3D Object Detection"],"prefix":"10.1145","author":[{"given":"Xiangyuan","family":"Peng","sequence":"first","affiliation":[{"name":"Infineon Technologies AG\/Technical University of Munich, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kay","family":"Bierzynski","sequence":"additional","affiliation":[{"name":"Infineon Technologies AG, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lorenzo","family":"Servadei","sequence":"additional","affiliation":[{"name":"Technical University of Munich, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Robert","family":"Wille","sequence":"additional","affiliation":[{"name":"Technical University of Munich, Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.02356"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW69036.2025.00195"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Chao Xiang Chen Feng Xiaopo Xie Botian Shi Hao Lu Yisheng Lv Mingchuan Yang and Zhendong Niu. 2023. Multi-sensor fusion and cooperative perception for autonomous driving: A review. IEEE Intelligent Transportation Systems Magazine 15 5 (2023) 36\u201358.","DOI":"10.1109\/MITS.2023.3283864"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Lili Fan Junhao Wang Yuanmeng Chang Yuke Li Yutong Wang and Dongpu Cao. 2024. 4D mmWave radar for autonomous driving perception: A comprehensive survey. IEEE Transactions on Intelligent Vehicles 9 4 (2024) 4606\u20134620.","DOI":"10.1109\/TIV.2024.3380244"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01436"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Long Zhuang Yiqing Yao Nuo Li Zijian Wang Lingtong Zhong Zijing Zhang and Tao Zhang. 2025. 4DRC-OC: Online Calibration of 4D Millimeter Wave Radar-Camera with Depth Map Assistance. IEEE Robotics and Automation Letters (2025).","DOI":"10.1109\/LRA.2025.3558453"},{"key":"e_1_3_3_1_8_2","unstructured":"Aaron Hurst Adam Lerer Adam\u00a0P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et\u00a0al. 2024. Gpt-4o system card. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.21276 (2024)."},{"key":"e_1_3_3_1_9_2","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et\u00a0al. 2023. Qwen technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.16609 (2023)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11128399"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"Jiaxing Zhang Chengjun Ge Wen Xiao Miao Tang Jon Mills Benjamin Coifman and Nengcheng Chen. 2026. Roadside lidar-based scene understanding toward intelligent traffic perception: A comprehensive review. ISPRS Journal of Photogrammetry and Remote Sensing 233 (2026) 69\u201388. 10.1016\/j.isprsjprs.2026.01.012","DOI":"10.1016\/j.isprsjprs.2026.01.012"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Hai Wu Shijia Zhao Xun Huang Qiming Xia Chenglu Wen Li Jiang Xin Li and Cheng Wang. 2025. Unsupervised 3D Object Detection by Commonsense Clue. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025).","DOI":"10.1109\/TPAMI.2025.3598341"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02551"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11127889"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC58415.2024.10919920"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Xin Bi Caien Weng Panpan Tong Baojie Fan and Arno Eichberge. 2025. MAFF-Net: Enhancing 3D Object Detection With 4D Radar Via Multi-Assist Feature Fusion. IEEE Robotics and Automation Letters (2025).","DOI":"10.1109\/LRA.2025.3550707"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01105"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Xiaogang Song Zhenhua Zhou Lei Zhang Xiaofeng Lu and Xinhong Hei. 2023. Psns-ssd: Pixel-level suppressed nonsalient semantic and multicoupled channel enhancement attention for 3d object detection. IEEE Robotics and Automation Letters 9 1 (2023) 603\u2013610.","DOI":"10.1109\/LRA.2023.3335773"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Yuyan Wu and Hae\u00a0Young Noh. 2025. 4DRadarRBD: 4D mmWave radar-based road boundary detection in autonomous driving. Frontiers in Signal Processing 5 (2025) 1667789.","DOI":"10.3389\/frsip.2025.1667789"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72335-3_12"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Shaoshuai Shi Li Jiang Jiajun Deng Zhe Wang Chaoxu Guo Jianping Shi Xiaogang Wang and Hongsheng Li. 2023. PV-RCNN++: Point-voxel feature set abstraction with local vector representation for 3D object detection. International Journal of Computer Vision 131 2 (2023) 531\u2013551.","DOI":"10.1007\/s11263-022-01710-9"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN60899.2024.10651526"},{"key":"e_1_3_3_1_23_2","volume-title":"ACM Multimedia 2024","author":"Jiang Shengyin","unstructured":"Shengyin Jiang, Shaoqing Xu, Li Liu, Ziying Song, Yang Bo, Zhi-Xin Yang, et\u00a0al. [n. d.]. SparseInteraction: Sparse Semantic Guidance for Radar and Camera 3D Object Detection. In ACM Multimedia 2024."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","unstructured":"Jianan Liu Qiuchi Zhao Weiyi Xiong Tao Huang Qing-Long Han and Bing Zhu. 2024. SMURF: Spatial Multi-Representation Fusion for 3D Object Detection With 4D Imaging Radar. IEEE Transactions on Intelligent Vehicles 9 1 (2024) 799\u2013812. 10.1109\/TIV.2023.3322729","DOI":"10.1109\/TIV.2023.3322729"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC57777.2023.10422406"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11127889"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Lianqing Zheng Sen Li Bin Tan Long Yang Sihan Chen Libo Huang Jie Bai Xichan Zhu and Zhixiong Ma. 2023. Rcfusion: Fusing 4d radar and camera with bird\u2019s-eye view features for 3d object detection. IEEE Transactions on Instrumentation and Measurement (2023).","DOI":"10.1109\/TIM.2023.3280525"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Shanliang Yao Runwei Guan Zhaodong Wu Yi Ni Zile Huang Ryan\u00a0Wen Liu Yong Yue Weiping Ding Eng\u00a0Gee Lim Hyungjoon Seo et\u00a0al. 2024. Waterscenes: A multi-task 4d radar-camera fusion dataset and benchmarks for autonomous driving on water surfaces. IEEE Transactions on Intelligent Transportation Systems 25 11 (2024) 16584\u201316598.","DOI":"10.1109\/TITS.2024.3415772"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.02617"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32397"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.02525"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS60139.2025.11246399"},{"key":"e_1_3_3_1_33_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et\u00a0al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.13923 (2025)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"e_1_3_3_1_35_2","first-page":"256","volume-title":"European conference on computer vision","author":"Sima Chonghao","year":"2024","unstructured":"Chonghao Sima, Katrin Renz, Kashyap Chitta, Li Chen, Hanxue Zhang, Chengen Xie, Jens Bei\u00dfwenger, Ping Luo, Andreas Geiger, and Hongyang Li. 2024. Drivelm: Driving with graph visual question answering. In European conference on computer vision. Springer, 256\u2013274."},{"key":"e_1_3_3_1_36_2","unstructured":"Yi Xu Yuxin Hu Zaiwei Zhang Gregory\u00a0P Meyer Siva\u00a0Karthik Mustikovela Siddhartha Srinivasa Eric\u00a0M Wolff and Xin Huang. 2024. Vlm-ad: End-to-end autonomous driving through vision-language model supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.14446 (2024)."},{"key":"e_1_3_3_1_37_2","unstructured":"Xiaoyu Tian Junru Gu Bailin Li Yicheng Liu Yang Wang Zhiyong Zhao Kun Zhan Peng Jia Xianpeng Lang and Hang Zhao. 2024. Drivevlm: The convergence of autonomous driving and large vision-language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.12289 (2024)."},{"key":"e_1_3_3_1_38_2","unstructured":"Bo Jiang Shaoyu Chen Bencheng Liao Xingyu Zhang Wei Yin Qian Zhang Chang Huang Wenyu Liu and Xinggang Wang. 2024. Senna: Bridging large vision-language models and end-to-end autonomous driving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.22313 (2024)."},{"key":"e_1_3_3_1_39_2","unstructured":"Jianbiao Mei Yukai Ma Xuemeng Yang Licheng Wen Xinyu Cai Xin Li Daocheng Fu Bo Zhang Pinlong Cai Min Dou et\u00a0al. 2024. Continuously learning adapting and improving: A dual-process approach to autonomous driving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.15324 (2024)."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611018"},{"key":"e_1_3_3_1_41_2","first-page":"129","volume-title":"European Conference on Computer Vision","author":"Zhou Yunsong","year":"2024","unstructured":"Yunsong Zhou, Linyan Huang, Qingwen Bu, Jia Zeng, Tianyu Li, Hang Qiu, Hongzi Zhu, Minyi Guo, Yu Qiao, and Hongyang Li. 2024. Embodied understanding of driving scenarios. In European Conference on Computer Vision. Springer, 129\u2013148."},{"key":"e_1_3_3_1_42_2","unstructured":"Zhenhua Xu Yujia Zhang Enze Xie Zhen Zhao Yong Guo Kwan-Yee\u00a0K Wong Zhenguo Li and Hengshuang Zhao. 2024. Drivegpt4: Interpretable end-to-end autonomous driving via large language model. IEEE Robotics and Automation Letters (2024)."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01609"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01432"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Erfei Cui Wenhai Wang Zhiqi Li Jiangwei Xie Haoming Zou Hanming Deng Gen Luo Lewei Lu Xizhou Zhu and Jifeng Dai. 2025. DriveMLM: aligning multi-modal large language models with behavioral planning states for autonomous driving. Visual Intelligence 3 1 (2025) 22.","DOI":"10.1007\/s44267-025-00095-w"},{"key":"e_1_3_3_1_46_2","unstructured":"Lei Yang Xinyu Zhang Jun Li Chen Wang Jiaqi Ma Zhiying Song Tong Zhao Ziying Song Li Wang Mo Zhou et\u00a0al. 2024. V2x-radar: A multi-modal dataset with 4d radar for cooperative perception. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.10962 (2024)."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9812038"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Yue Hu Shaoheng Fang Zixing Lei Yiqi Zhong and Siheng Chen. 2022. Where2comm: Communication-efficient collaborative perception via spatial confidence maps. Advances in neural information processing systems 35 (2022) 4874\u20134886.","DOI":"10.52202\/068431-0352"},{"key":"e_1_3_3_1_49_2","volume-title":"Conference on Robot Learning (CoRL)","author":"Zhengzhong\u00a0Tu Hao Xiang Wei Shao Bolei Zhou Jiaqi\u00a0Ma Runsheng\u00a0Xu,","year":"2022","unstructured":"Hao Xiang Wei Shao Bolei Zhou Jiaqi\u00a0Ma Runsheng\u00a0Xu, Zhengzhong\u00a0Tu. 2022. CoBEVT: Cooperative Bird\u2019s Eye View Semantic Segmentation with Sparse Transformers. In Conference on Robot Learning (CoRL)."},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i9.33040"},{"key":"e_1_3_3_1_51_2","unstructured":"Binyu Zhao Wei Zhang and Zhaonian Zou. 2023. Bm2cp: Efficient collaborative perception with lidar-camera modalities. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.14702 (2023)."},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"crossref","unstructured":"Junwei You Zhuoyu Jiang Zilin Huang Haotian Shi Rui Gan Keshu Wu Xi Cheng Xiaopeng Li and Bin Ran. 2026. V2x-vlm: End-to-end v2x cooperative autonomous driving through large vision-language models. Transportation Research Part C: Emerging Technologies 183 (2026) 105457.","DOI":"10.1016\/j.trc.2025.105457"},{"key":"e_1_3_3_1_53_2","unstructured":"Junwei You Pei Li Zhuoyu Jiang Zilin Huang Rui Gan Haotian Shi and Bin Ran. 2025. V2X-REALM: Vision-Language Model-Based Robust End-to-End Cooperative Autonomous Driving with Adaptive Long-Tail Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.21041 (2025)."},{"key":"e_1_3_3_1_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01418"},{"key":"e_1_3_3_1_55_2","unstructured":"Hao Jiang Chuan Hu Yukang Shi Yuan He Ke Wang Xi Zhang and Zhipeng Zhang. 2025. Structured Labeling Enables Faster Vision-Language Models for End-to-End Autonomous Driving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.05442 (2025)."},{"key":"e_1_3_3_1_56_2","unstructured":"Yiheng Li Cunxin Fan Chongjian Ge Zhihao Zhao Chenran Li Chenfeng Xu Huaxiu Yao Masayoshi Tomizuka Bolei Zhou Chen Tang et\u00a0al. 2024. WOMD-Reasoning: A Large-Scale Dataset for Interaction Reasoning in Driving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.04281 (2024)."},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"crossref","unstructured":"Kexin Tian Jingrui Mao Yunlong Zhang Jiwan Jiang Yang Zhou and Zhengzhong Tu. 2025. Nuscenes-spatialqa: A spatial understanding and reasoning benchmark for vision-language models in autonomous driving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.03164 (2025).","DOI":"10.1109\/ICCVW69036.2025.00480"},{"key":"e_1_3_3_1_58_2","unstructured":"Aditya Taparia Noel Ngu Mario Leiva Joshua\u00a0Shay Kricheli John Corcoran Nathaniel\u00a0D Bastian Gerardo Simari Paulo Shakarian and Ransalu Senanayake. 2025. VLC Fusion: Vision-Language Conditioned Sensor Fusion for Robust Object Detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.12715 (2025)."},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00734"},{"key":"e_1_3_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02061"},{"key":"e_1_3_3_1_61_2","unstructured":"Hsu-kuang Chiu Ryo Hachiuma Chien-Yi Wang Stephen\u00a0F Smith Yu-Chiang\u00a0Frank Wang and Min-Hung Chen. 2025. V2v-llm: Vehicle-to-vehicle cooperative autonomous driving with multi-modal large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.09980 (2025)."},{"key":"e_1_3_3_1_62_2","unstructured":"Hsu kuang Chiu Ryo Hachiuma Chien-Yi Wang Yu-Chiang\u00a0Frank Wang Min-Hung Chen and Stephen\u00a0F. Smith. 2025. V2V-GoT: Vehicle-to-Vehicle Cooperative Autonomous Driving with Multimodal Large Language Models and Graph-of-Thoughts. arxiv:https:\/\/arXiv.org\/abs\/2509.18053\u00a0[cs.RO] https:\/\/arxiv.org\/abs\/2509.18053"},{"key":"e_1_3_3_1_63_2","unstructured":"Runwei Guan Rongsheng Hu Shangshu Chen Ningyuan Xiao Xue Xia Jiayang Liu Beibei Chen Ziren Tang Ningwei Ouyang Shaofeng Liang et\u00a0al. 2025. RoadSceneVQA: Benchmarking Visual Question Answering in Roadside Perception Systems for Intelligent Transportation System. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.18286 (2025)."},{"key":"e_1_3_3_1_64_2","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC48978.2021.9564754"},{"key":"e_1_3_3_1_65_2","doi-asserted-by":"crossref","unstructured":"Dong-Hee Paek Seung-Hyun Kong and Kevin\u00a0Tirta Wijaya. 2022. K-radar: 4d radar object detection for autonomous driving in various weather conditions. Advances in Neural Information Processing Systems 35 (2022) 3819\u20133829.","DOI":"10.52202\/068431-0276"},{"key":"e_1_3_3_1_66_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19842-7_7"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02137"},{"key":"e_1_3_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160546"},{"key":"e_1_3_3_1_69_2","doi-asserted-by":"crossref","unstructured":"Haowen Lai Peng Yin and Sebastian Scherer. 2022. Adafusion: Visual-lidar fusion with adaptive weights for place recognition. IEEE Robotics and Automation Letters 7 4 (2022) 12038\u201312045.","DOI":"10.1109\/LRA.2022.3210880"},{"key":"e_1_3_3_1_70_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10801398"},{"key":"e_1_3_3_1_71_2","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_1_72_2","doi-asserted-by":"crossref","unstructured":"Xun Huang Jinlong Wang Qiming Xia Siheng Chen Bisheng Yang Xin Li Cheng Wang and Chenglu Wen. 2024. V2x-r: Cooperative lidar-4d radar fusion for 3d object detection with denoising diffusion. arXiv preprint:2411.08402 (2024).","DOI":"10.1109\/CVPR52734.2025.02551"},{"key":"e_1_3_3_1_73_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:20:23Z","timestamp":1781536823000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810832"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":72,"alternative-id":["10.1145\/3805622.3810832","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810832","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}