{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T19:45:17Z","timestamp":1780343117822,"version":"3.54.1"},"reference-count":250,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"DiDi GAIA Research Cooperation Initiative","award":["CCF-DiDi GAIA 202304"],"award-info":[{"award-number":["CCF-DiDi GAIA 202304"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Intell. Veh."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1109\/tiv.2024.3406372","type":"journal-article","created":{"date-parts":[[2024,5,28]],"date-time":"2024-05-28T14:19:57Z","timestamp":1716905997000},"page":"8040-8063","source":"Crossref","is-referenced-by-count":24,"title":["Delving Into Multi-Modal Multi-Task Foundation Models for Road Scene Understanding: From Learning Paradigm Perspectives"],"prefix":"10.1109","volume":"9","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-0917-7011","authenticated-orcid":false,"given":"Sheng","family":"Luo","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5563-7300","authenticated-orcid":false,"given":"Wei","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7615-5299","authenticated-orcid":false,"given":"Wanxin","family":"Tian","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rui","family":"Liu","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Luanxuan","family":"Hou","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiubao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2654-3084","authenticated-orcid":false,"given":"Haifeng","family":"Shen","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4721-3014","authenticated-orcid":false,"given":"Ruiqi","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4596-0098","authenticated-orcid":false,"given":"Shuyi","family":"Geng","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3021-3229","authenticated-orcid":false,"given":"Yi","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8264-6117","authenticated-orcid":false,"given":"Ling","family":"Shao","sequence":"additional","affiliation":[{"name":"UCAS-Terminus AI Lab, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yi","family":"Yang","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bojun","family":"Gao","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qun","family":"Li","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9510-2279","authenticated-orcid":false,"given":"Guobin","family":"Wu","sequence":"additional","affiliation":[{"name":"Didi Chuxing, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref2","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Alayrac","year":"2022"},{"key":"ref3","article-title":"DriveMLM: Aligning multi-modal large language models with behavioral planning states for autonomous driving","author":"Wang","year":"2023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01499"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC57777.2023.10421901"},{"key":"ref6","article-title":"Apollo auto. Baidu","year":"2019"},{"key":"ref7","article-title":"Openflamingo: An open-source framework for training large autoregressive vision-language models","author":"Awadalla","year":"2023"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_16"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00116"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02157"},{"key":"ref11","article-title":"BEiT: BERT pre-training of image transformers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Bao"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/RTCSA.2018.00011"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/s40687-022-00354-y"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.trc.2021.103548"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/SSCI52147.2023.10371887"},{"key":"ref16","article-title":"MUVO: A multimodal generative world model for autonomous driving with geometric representations","author":"Bogdoll","year":"2023"},{"key":"ref17","first-page":"2165","article-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","volume-title":"Proc. 7th Conf. Robot Learn.","volume":"229","author":"Zitkovich"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.025"},{"key":"ref19","article-title":"Video generation models as world simulators","author":"Brooks","year":"2024"},{"key":"ref20","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"ref22","article-title":"Less is more: Removing text-regions improves clip training efficiency and robustness","author":"Cao","year":"2023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3435937"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10611018"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00678"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref28","first-page":"22243","article-title":"Big self-supervised models are strong semi-supervised learners","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Chen","year":"2020"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01852-4"},{"key":"ref30","article-title":"Improved baselines with momentum contrastive learning","author":"Chen","year":"2020"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02076"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3543072"},{"key":"ref33","article-title":"Large model based referring camouflaged object detection","author":"Cheng","year":"2023"},{"key":"ref34","article-title":"Language-guided 3D object detection in point cloud for autonomous driving","author":"Cheng","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3200245"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27897"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2018.2858464"},{"issue":"70","key":"ref38","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung","year":"2024","journal-title":"J. Mach. Learn. Res."},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00101"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/MITS.2024.3381793"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3632181"},{"key":"ref44","article-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Dai","year":"2024"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00720"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1215"},{"key":"ref47","article-title":"Talk2bev: Language-enhanced bird\u2019s-eye view maps for autonomous driving","author":"Dewangan","year":"2023"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_5"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2023.3287359"},{"key":"ref50","article-title":"HiLM-D: Towards high-resolution understanding in multimodal large language models for autonomous driving","author":"Ding","year":"2023"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01058"},{"key":"ref52","article-title":"Applications of computer vision in autonomous vehicles: Methods, challenges and future directions","author":"Dong","year":"2023"},{"key":"ref53","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref54","first-page":"1","article-title":"CARLA: An open urban driving simulator","volume-title":"Proc. 1st Annu. Conf. Robot Learn.","author":"Dosovitskiy","year":"2017"},{"key":"ref55","first-page":"8469","article-title":"PaLM-E: An embodied multimodal language model","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Driess"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10132-6"},{"key":"ref57","article-title":"Diversity is all you need: Learning skills without a reward function","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Eysenbach"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2020.2972974"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00102"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1117\/12.3054764"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00340"},{"key":"ref65","article-title":"Recurrent world models facilitate policy evolution","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Ha","year":"2018"},{"key":"ref66","article-title":"Measuring the impact of scene level objects on object detection: Towards quantitative explanations of detection decisions","author":"Haar","year":"2024"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_20"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01433"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00727"},{"key":"ref73","article-title":"GAIA-1: A generative world model for autonomous driving","author":"Hu","year":"2023"},{"key":"ref74","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"ref76","article-title":"GOOD: Exploring geometric cues for detecting objects in an open world","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Huang"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/3703155"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00104"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00325"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00757"},{"key":"ref81","article-title":"Autonomy 2.0: Why is self-driving always 5 years away?","author":"Jain","year":"2021"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160614"},{"key":"ref83","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia","year":"2021"},{"key":"ref84","article-title":"Adriver-I: A general world model for autonomous driving","author":"Jia","year":"2023"},{"key":"ref85","article-title":"Revisiting multi-modal 3D semantic segmentation in real-world autonomous driving","author":"Jiang","year":"2023"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28037"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160326"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10341367"},{"key":"ref89","first-page":"7482","article-title":"Multi-task learning using uncertainty to weigh losses for scene geometry and semantics","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Kendall","year":"2018"},{"key":"ref90","article-title":"Real-time traffic object detection for autonomous driving","author":"Khan","year":"2024"},{"key":"ref91","article-title":"Fully sparse long range 3D object detection using range experts and multimodal virtual points","author":"Khoche","year":"2023"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_35"},{"key":"ref93","article-title":"Camera height doesn\u2019t change: Unsupervised monocular scale-aware road-scene depth estimation","author":"Kinoshita","year":"2023"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1611835114"},{"issue":"1","key":"ref96","article-title":"A path towards autonomous machine intelligence version 0.9.2, 2022-06-27","volume":"62","author":"LeCun","year":"2022","journal-title":"Open Rev."},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.329"},{"key":"ref98","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_24"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00539"},{"key":"ref101","article-title":"Towards knowledge-driven autonomous driving","author":"Li","year":"2023"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-97-3005-6_33"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3158253"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.3389\/fnbot.2022.916808"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10341834"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00927"},{"key":"ref109","first-page":"19645","article-title":"Effective adaptation in multi-task co-training for unified autonomous driving","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Liang","year":"2022"},{"key":"ref110","first-page":"12037","article-title":"Pareto multi-task learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lin","year":"2019"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00819"},{"key":"ref112","article-title":"World model on million-length video and language with ringattention","author":"Liu","year":"2024"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02484"},{"key":"ref114","article-title":"Visual instruction tuning","volume-title":"Proc. 37th Conf. Neural Inf. Process. Syst.","author":"Liu","year":"2023"},{"key":"ref115","article-title":"Generalized few-shot 3D object detection of lidar point cloud for autonomous driving","author":"Liu","year":"2023"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00644"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00197"},{"key":"ref118","article-title":"Roberta: A robustly optimized BERT pretraining approach","author":"Liu","year":"2019"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.279"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220007"},{"key":"ref121","first-page":"6522","article-title":"Efficient continuous Pareto exploration in multi-task learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ma","year":"2020"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72995-9_23"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01434"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00110"},{"key":"ref125","article-title":"GPT-driver: Learning to drive with GPT","author":"Mao","year":"2023"},{"key":"ref126","article-title":"A. language agent for autonomous driving","author":"Mao","year":"2023"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1016\/s0079-7421(08)60536-8"},{"key":"ref128","volume-title":"An Approach to Environmental Psychology","author":"Mehrabian","year":"1974"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160674"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3295254"},{"key":"ref131","article-title":"Uniworld: Autonomous driving pre-training via world models","author":"Min","year":"2023"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01470"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00339"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/tits.2024.3510642"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/HONET59747.2023.10374639"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00790"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_17"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446116"},{"key":"ref139","article-title":"GPT-4v(ision) system card","year":"2023"},{"key":"ref140","article-title":"GPT-4v(ision) technical work and authors","year":"2023"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/tuffc.2012.2167\/mm1"},{"key":"ref142","article-title":"ChatGPT: Optimizing language models for dialogue","year":"2022"},{"key":"ref143","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ouyang","year":"2022"},{"key":"ref144","article-title":"Open X-embodiment: Robotic learning datasets and RT-X models","volume-title":"Proc. Towards Generalist Robots: Learn. Paradigms Scalable Skill Acquisition  CoRL2023","author":"Vuong","year":"2023"},{"key":"ref145","article-title":"Incorporating language-driven appearance knowledge units with visual cues in pedestrian detection","author":"Park","year":"2023"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2023.3343495"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126658"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02161"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00700"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"ref151","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00373"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.587"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/DSA56465.2022.00107"},{"key":"ref155","article-title":"Progressive neural networks","author":"Rusu","year":"2016"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341600"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00734"},{"key":"ref158","first-page":"525","article-title":"Multi-task learning as multi-objective optimization","volume-title":"Proc. Int.Conf. Adv. Neural Inf. Process. Syst.","author":"Sener","year":"2018"},{"key":"ref159","article-title":"LanguageMPC: Large language models as decision makers for autonomous driving","author":"Sha","year":"2023"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01432"},{"key":"ref161","first-page":"15558","article-title":"K-lite: Learning transferable visual models with external knowledge","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","author":"Shen","year":"2022"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2024.3463409\/mm1"},{"key":"ref163","article-title":"Chain-of-instructions: Compositional instruction tuning on large language models","author":"Anugrah","year":"2024"},{"key":"ref164","first-page":"948","article-title":"Lip-Loc: Lidar image pretraining for cross-modal localization","volume-title":"Proc. IEEE\/CVF Winter Conf. Appl. Comput. Vis.","author":"Shubodh","year":"2024"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72943-0_15"},{"key":"ref166","article-title":"End-to-end autonomous driving using deep learning: A systematic review","author":"Singh","year":"2023"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2019.8851842"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/tgrs.2023.3331893"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/141"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-78172-8_9"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01026"},{"key":"ref172","article-title":"CPSOR-GCN: A vehicle trajectory prediction method powered by emotion and cognitive theory","author":"Tang","year":"2023"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/IVS.2018.8500504"},{"key":"ref174","article-title":"DriveVLM: The convergence of autonomous driving and large vision-language models","author":"Tian","year":"2024"},{"key":"ref175","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref176","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1201\/9781003328957-13"},{"key":"ref178","first-page":"6309","article-title":"Neural discrete representation learning","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","author":"Van Den Oord","year":"2017"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610905"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-63596-0_54"},{"key":"ref182","article-title":"Voyager: An open-ended embodied agent with large language models","volume-title":"Proc. NeurIPS Found. Models Decis. Mak. Workshop","author":"Wang","year":"2023"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/TVT.2024.3394350"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1109\/tiv.2024.3449278"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10611590"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73195-2_4"},{"key":"ref188","article-title":"Frustratingly simple few-shot object detection","author":"Wang","year":"2020"},{"key":"ref189","article-title":"Empowering autonomous driving with large language models: A safety perspective","author":"Wang","year":"2023"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01624"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01397"},{"key":"ref192","article-title":"Revisiting the power of prompt for visual tuning","author":"Wang","year":"2024"},{"key":"ref193","article-title":"Lingo-1: Exploring natural language for autonomous driving","year":"2023"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"ref195","article-title":"Bev-clip: Multi-modal Bev retrieval methodology for complex scene in autonomous driving","author":"Wei","year":"2024"},{"key":"ref196","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","author":"Wei","year":"2022"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01428"},{"key":"ref198","article-title":"Learning to model diverse driving behaviors in highly interactive autonomous driving scenarios with multi-agent reinforcement learning","author":"Weiwei","year":"2024"},{"key":"ref199","article-title":"DiLu: A knowledge-driven approach to autonomous driving with large language models","volume-title":"Proc.12th Int. Conf. Learn. Representations","author":"Wen"},{"key":"ref200","article-title":"On the road with GPT-4V (ision): Early explorations of visual-language model on autonomous driving","author":"Wen","year":"2023"},{"key":"ref201","article-title":"Visual chatGPT: Talking, drawing and editing with visual foundation models","author":"Wu","year":"2023"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1339-y"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01406"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32902"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-8850-1_1"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610018"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02044"},{"key":"ref208","article-title":"Learning to adapt sam for segmenting cross-domain point clouds","author":"Xidong","year":"2023"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01613"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793743"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/tse.2024.3388572"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2025.3550011"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2024.3440097"},{"key":"ref215","article-title":"Forging vision foundation models for autonomous driving: Challenges, methodologies, and opportunities","author":"Yan","year":"2024"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01453"},{"key":"ref217","article-title":"Traffic sign interpretation in real road scene","author":"Yang","year":"2023"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01443"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i9.33001"},{"key":"ref220","article-title":"MixSup: Mixed-grained supervision for label-efficient LiDAR-based 3D object detection","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Yang"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00140"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8546189"},{"key":"ref223","article-title":"A survey of large language models for autonomous driving","author":"Yang","year":"2023"},{"key":"ref224","article-title":"Taskprompter: Spatial-channel multi-task prompting for dense scene understanding","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Ye","year":"2023"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02214-4"},{"key":"ref227","article-title":"ReSimAD: Zero-shot 3D domain transfer for autonomous driving with source reconstruction and target simulation","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Zhang"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72907-2_1"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28546"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-short.107"},{"key":"ref232","article-title":"LLaMA-adapter: Efficient fine-tuning of large language models with zero-initialized attention","volume-title":"Proc.12th Int. Conf. Learn. Representations","author":"Zhang"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1016\/j.tranpol.2024.03.006"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1016\/j.trf.2015.05.008"},{"key":"ref235","article-title":"CAE v2: Context autoencoder with CLIP latent alignment","author":"Zhang","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref236","article-title":"Meta-transformer: A unified framework for multimodal learning","author":"Zhang","year":"2023"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02073"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161243"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72624-8_4"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73650-6_6"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1109\/tiv.2024.3406867"},{"key":"ref243","article-title":"LiDAR-PTQ: Post-training quantization for point cloud 3D object detection","volume-title":"Proc.12th Int. Conf. Learn. Representations","author":"Zhou"},{"key":"ref244","article-title":"Vision language models in autonomous driving and intelligent transportation systems","author":"Zhou","year":"2023"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610779"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_8"},{"key":"ref247","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Zhu"},{"key":"ref248","article-title":"Open world object detection in the era of foundation models","author":"Zohar","year":"2023"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611805"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP51287.2024.10648203"}],"container-title":["IEEE Transactions on Intelligent Vehicles"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7274857\/11077821\/10540321.pdf?arnumber=10540321","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,18]],"date-time":"2025-07-18T17:47:20Z","timestamp":1752860840000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10540321\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":250,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tiv.2024.3406372","relation":{},"ISSN":["2379-8904","2379-8858"],"issn-type":[{"value":"2379-8904","type":"electronic"},{"value":"2379-8858","type":"print"}],"subject":[],"published":{"date-parts":[[2024,12]]}}}