{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:43:28Z","timestamp":1763192608274,"version":"3.45.0"},"reference-count":31,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004735","name":"Natural Science Foundation of Hunan Province","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004735","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228491","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Omni-V2X: A Vision-Language Model for Actionable Insights in Vehicle-to-Everything Systems"],"prefix":"10.1109","author":[{"given":"Nicanor","family":"Mayumu","sequence":"first","affiliation":[{"name":"Central South University,School of Computer Science and Engineering,Changsha,China"}]},{"given":"Deng","family":"Xiaoheng","sequence":"additional","affiliation":[{"name":"Central South University,School of Electronic Information,Changsha,China"}]},{"given":"Patrick","family":"Mukala","sequence":"additional","affiliation":[{"name":"University of Wollongong in Dubai,School of Computer Science,Dubai,United Arab Emirates"}]},{"given":"Saif Ur Rehman","family":"Khan","sequence":"additional","affiliation":[{"name":"Central South University,School of Computer Science and Engineering,Changsha,China"}]},{"given":"Muhammad Usman","family":"Saeed","sequence":"additional","affiliation":[{"name":"Central South University,School of Computer Science and Engineering,Changsha,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19842-7_7"},{"article-title":"How2comm: Communication-efficient and collaboration-pragmatic multi-agent perception","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Yang","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01669"},{"key":"ref4","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-031-19824-3_19","volume-title":"Latency-aware collaborative perception","author":"Lei","year":"2022"},{"volume-title":"Flow-based feature fusion for vehicle-infrastructure cooperative 3d object detection","year":"2023","author":"Yu","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9812038"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02067"},{"key":"ref8","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR52729.2023.00892","volume-title":"Collaboration helps camera overtake lidar in 3d detection","author":"Hu","year":"2023"},{"volume-title":"Drivegpt4: Interpretable end-to-end autonomous driving via large language model","year":"2024","author":"Xu","key":"ref9"},{"key":"ref10","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR52733.2024.01416","volume-title":"Driving everywhere with large language model policy adaptation","author":"Li","year":"2024"},{"article-title":"Omnidrive: A holistic llm-agent framework for autonomous driving with 3d perception, reasoning and planning","year":"2024","author":"Wang","key":"ref11"},{"volume-title":"Drivelm: Driving with graph visual question answering","year":"2024","author":"Sima","key":"ref12"},{"article-title":"Drivevlm: The convergence of autonomous driving and large vision-language models","year":"2024","author":"Tian","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611018"},{"volume-title":"Long-term recurrent convolutional networks for visual recognition and description","year":"2016","author":"Donahue","key":"ref15"},{"volume-title":"Show and tell: A neural image caption generator","year":"2015","author":"Vinyals","key":"ref16"},{"volume-title":"Open-set object detection: Towards unified problem formulation and benchmarking","year":"2024","author":"Ammar","key":"ref17"},{"volume-title":"Unifying vision-and-language tasks via text generation","year":"2021","author":"Cho","key":"ref18"},{"key":"ref19","article-title":"Unified multimodal pretraining for vision-language understanding","author":"Chen","year":"2022","journal-title":"IEEE Transactions on Multimedia"},{"article-title":"Unifying vision and language with multimodal transformers","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Wang","key":"ref20"},{"volume-title":"Gemini: A family of highly capable multimodal models","year":"2024","key":"ref21"},{"volume-title":"The llama 3 herd of models","year":"2024","author":"Grattafiori","key":"ref22"},{"volume-title":"Phi-3 technical report: A highly capable language model locally on your phone","year":"2024","author":"Abdin","key":"ref23"},{"volume-title":"Lora: Low-rank adaptation of large language models","year":"2021","author":"Hu","key":"ref24"},{"volume-title":"Qlora: Efficient finetuning of quantized llms","year":"2023","author":"Dettmers","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00531"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref28","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text Summarization Branches Out"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"volume-title":"Spice: Semantic propositional image caption evaluation","year":"2016","author":"Anderson","key":"ref30"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228491.pdf?arnumber=11228491","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:40:11Z","timestamp":1763192411000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228491\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":31,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228491","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}