{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,22]],"date-time":"2026-06-22T10:00:40Z","timestamp":1782122440529,"version":"3.54.5"},"reference-count":85,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62373387"],"award-info":[{"award-number":["62373387"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003453","name":"Natural Science Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2023A1515030264"],"award-info":[{"award-number":["2023A1515030264"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004826","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","award":["4252048"],"award-info":[{"award-number":["4252048"]}],"id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017607","name":"Shenzhen Fundamental Research Program","doi-asserted-by":"publisher","award":["JCYJ20240813151301003"],"award-info":[{"award-number":["JCYJ20240813151301003"]}],"id":[{"id":"10.13039\/501100017607","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100009076","name":"University of Science and Technology of China","doi-asserted-by":"publisher","award":["2421002"],"award-info":[{"award-number":["2421002"]}],"id":[{"id":"10.13039\/501100009076","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00257","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"2675-2685","source":"Crossref","is-referenced-by-count":7,"title":["Where, What, Why: Towards Explainable Driver Attention Prediction"],"prefix":"10.1109","author":[{"given":"Yuchen","family":"Zhou","sequence":"first","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiayu","family":"Tang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoyan","family":"Xiao","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yueyao","family":"Lin","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Linkai","family":"Liu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zipeng","family":"Guo","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hao","family":"Fei","sequence":"additional","affiliation":[{"name":"National University of Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaobo","family":"Xia","sequence":"additional","affiliation":[{"name":"National University of Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chao","family":"Gou","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/tits.2021.3055120"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/IV51971.2022.9827175"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01293"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00752"},{"key":"ref5","article-title":"Recurrent mixture density network for spatiotemporal visual attention","author":"Bazzani","year":"2017","journal-title":"ICLR"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/882"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0614"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_6"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_18"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01230"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00154"},{"key":"ref12","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","author":"Chiang","year":"2023"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2019.2915540"},{"issue":"3","key":"ref15","first-page":"2413","article-title":"Driving visual saliency prediction of dynamic night scenes via a spatiotemporal dual-encoder network","volume":"25","author":"Deng","year":"2023","journal-title":"IEEE T-ITS"},{"key":"ref16","first-page":"1","article-title":"Fbnet: Feedbackrecursive cnn for saliency detection","volume-title":"MVA","author":"Ding","year":"2021"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2834826"},{"key":"ref18","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021","journal-title":"ICLR. OpenReview.net"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC.2019.8917218"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2020.3044678"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-021-04151-2"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2022.3173397"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7503.003.0073"},{"issue":"2","key":"ref24","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022","journal-title":"ICLR"},{"issue":"2","key":"ref25","first-page":"1800","article-title":"Data-driven estimation of driver attention using calibration-free eye gaze and scene features","volume":"69","author":"Hu","year":"2021","journal-title":"IEEE T-IE"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3208004"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-97-8792-0_13"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446034"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/34.730558"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/tits.2024.3510116"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_8"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/tits.2022.3186613"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/IV55156.2024.10588528"},{"key":"ref34","article-title":"Deep gaze I: boosting saliency prediction with feature maps trained on imagenet","author":"K\u00fcmmerer","year":"2015","journal-title":"ICLRW"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"ref36","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International conference on machine learning","author":"Li","year":"2022"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3165619"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3428317"},{"key":"ref39","article-title":"Perception, reason, think, and plan: A survey on large multimodal reasoning models","author":"Li","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC58415.2024.10919532"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01268"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610797"},{"key":"ref46","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017","journal-title":"arXiv preprint arXiv"},{"key":"ref47","article-title":"Deem: Diffusion models serve as the eyes of large language models for image perception","author":"Luo","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.1009"},{"key":"ref49","article-title":"Gui-r1: A generalist r1-style vision-language action model for gui agents","author":"Luo","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01190"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2845370"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00943"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1038\/s41562-023-01543-7"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.ne.13.030190.000325"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107404"},{"key":"ref56","first-page":"8748","article-title":"Learning transferable visual models from natural language super-vision","volume-title":"ICML","author":"Radford","year":"2021"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3295058"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72943-0_15"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-024-00064-9"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1177\/1745691620953773"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/JAS.2022.105716"},{"key":"ref62","article-title":"Drivevlm: The convergence of autonomous driving and large vision-language models","author":"Tian","year":"2024","journal-title":"CoRL"},{"key":"ref63","article-title":"Next-gpt: Any-to-any multimodal llm","author":"Wu","year":"2024","journal-title":"ICML"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20873-8_42"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00370"},{"key":"ref66","first-page":"11717","article-title":"Pyramid grafting network for onestage high resolution saliency detection","author":"Xie","year":"2022","journal-title":"CVPR"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1016\/j.compeleceng.2024.109104"},{"key":"ref68","first-page":"1890","article-title":"Voila-a: Aligning vision-language models with user\u2019s gaze attention","volume":"37","author":"Yan","year":"2025","journal-title":"NeurIPS"},{"key":"ref69","article-title":"Qwen2. 5 technical report","author":"Yang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3661312"},{"key":"ref71","volume-title":"Rama: Retrieval-augmented multi-agent framework for misinformation detection in multimodal fact-checking","author":"Yang","year":"2025"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-023-00005-y"},{"key":"ref73","article-title":"From seeing to doing: Bridging reasoning and decision for robotic manipulation","author":"Yuan","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02214-4"},{"key":"ref75","article-title":"Da-bev: Depth aware bev transformer for 3d object detection","author":"Zhang","year":"2023","journal-title":"CoRR"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72775-7_2"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/689"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i2.32157"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612581"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2023.3309309"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02682"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2024.3384989"},{"key":"ref83","first-page":"3122","article-title":"Few-shot adversarial prompt learning on vision-language models","volume":"37","author":"Zhou","year":"2024","journal-title":"NIPS"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3565410"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00786"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444546.pdf?arnumber=11444546","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:29:45Z","timestamp":1777613385000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444546\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":85,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00257","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}