{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:53Z","timestamp":1781539073196,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810608","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1793-1797","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Object Referring-Guided Scanpath Prediction with Perception-Enhanced Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1494-6193","authenticated-orcid":false,"given":"Rong","family":"Quan","sequence":"first","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics\u200c, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3836-0392","authenticated-orcid":false,"given":"Yantao","family":"Lai","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics\u200c, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2784-3449","authenticated-orcid":false,"given":"Dong","family":"Liang","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics\u200c, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0306-534X","authenticated-orcid":false,"given":"Jie","family":"Qin","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics\u200c, Nanjing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2022.XVIII.025"},{"key":"e_1_3_3_1_3_2","unstructured":"Shuai Bai Yuxuan Cai Ruizhe Chen Keqin Chen Xionghui Chen Zesen Cheng Lianghao Deng Wei Ding Chang Gao Chunjiang Ge Wenbin Ge Zhifang Guo Qidong Huang Jie Huang Fei Huang Binyuan Hui Shutong Jiang Zhaohai Li Mingsheng Li Mei Li Kaixin Li Zicheng Lin Junyang Lin Xuejing Liu Jiawei Liu Chenglong Liu Yang Liu Dayiheng Liu Shixuan Liu Dunjie Lu Ruilin Luo Chenxu Lv Rui Men Lingchen Meng Xuancheng Ren Xingzhang Ren Sibo Song Yuchong Sun Jun Tang Jianhong Tu Jianqiang Wan Peng Wang Pengfei Wang Qiuyue Wang Yuxuan Wang Tianbao Xie Yiheng Xu Haiyang Xu Jin Xu Zhibo Yang Mingkun Yang Jianxin Yang An Yang Bowen Yu Fei Zhang Hang Zhang Xi Zhang Bo Zheng Humen Zhong Jingren Zhou Fan Zhou Jing Zhou Yuanzhi Zhu and Ke Zhu. 2025. Qwen3-VL Technical Report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.21631 (2025)."},{"key":"e_1_3_3_1_4_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et\u00a0al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.13923 (2025)."},{"key":"e_1_3_3_1_5_2","unstructured":"Siddhant Bansal Michael Wray and Dima Damen. 2024. Hoi-ref: Hand-object interaction referral in egocentric vision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.09933 (2024)."},{"key":"e_1_3_3_1_6_2","unstructured":"Giuseppe Cartella Marcella Cornia Vittorio Cuculo Alessandro D\u2019Amelio Dario Zanca Giuseppe Boccignone and Rita Cucchiara. 2024. Trends applications and challenges in human attention modelling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.18673 (2024)."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01073"},{"key":"e_1_3_3_1_8_2","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et\u00a0al. 2024. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.05271 (2024)."},{"key":"e_1_3_3_1_9_2","unstructured":"Junyoung Chung Caglar Gulcehre KyungHyun Cho and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.3555 (2014)."},{"key":"e_1_3_3_1_10_2","first-page":"4171","volume-title":"Proceedings of the 2019 Conference of the North American chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 Conference of the North American chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). 4171\u20134186."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-49695-1_37"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Alex Graves. 2012. Long short-term memory. Supervised sequence labelling with recurrent neural networks (2012) 37\u201345.","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.1315204"},{"key":"e_1_3_3_1_15_2","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang Weizhu Chen et\u00a0al. 2022. Lora: Low-rank adaptation of large language models.ICLR 1 2 (2022) 3."},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446034"},{"key":"e_1_3_3_1_17_2","unstructured":"Ozgur Kara Harris Nisar and James\u00a0M Rehg. 2025. DiffEye: Diffusion-Based Continuous Eye-Tracking Data Generation Conditioned on Natural Images. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.16767 (2025)."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Ewen Lavoie Jacqueline\u00a0S Hebert and Craig\u00a0S Chapman. 2024. Comparing eye\u2013hand coordination between controller-mediated virtual reality and a real-world object interaction task. Journal of Vision 24 2 (2024) 9\u20139.","DOI":"10.1167\/jov.24.2.9"},{"key":"e_1_3_3_1_19_2","unstructured":"Daeun Lee Subhojyoti Mukherjee Branislav Kveton Ryan\u00a0A Rossi Viet\u00a0Dac Lai Seunghyun Yoon Trung Bui Franck Dernoncourt and Mohit Bansal. 2025. StreamGaze: Gaze-Guided Temporal Reasoning and Proactive Understanding in Streaming Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2512.01707 (2025)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual instruction tuning. Advances in Neural Information Processing systems (NeurIPs) 36 (2023) 34892\u201334916.","DOI":"10.52202\/075280-1516"},{"key":"e_1_3_3_1_23_2","unstructured":"Yinhan Liu Myle Ott Naman Goyal Jingfei Du Mandar Joshi Danqi Chen Omer Levy Mike Lewis Luke Zettlemoyer and Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.11692 (2019)."},{"key":"e_1_3_3_1_24_2","first-page":"197","volume-title":"Chinese Conference on Pattern Recognition and Computer Vision (PRCV)","author":"Liu Yifei","year":"2025","unstructured":"Yifei Liu and Rong Quan. 2025. Effective Text-Directed Scanpath Prediction via Comprehensive Multi-modal Information Fusion. In Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 197\u2013211."},{"key":"e_1_3_3_1_25_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Daniel Martin Ana Serrano Alexander\u00a0W Bergman Gordon Wetzstein and Belen Masia. 2022. Scangan360: A generative model of realistic scanpaths for 360 images. IEEE Transactions on Visualization and Computer Graphics 28 5 (2022) 2003\u20132013.","DOI":"10.1109\/TVCG.2022.3150502"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01249"},{"key":"e_1_3_3_1_28_2","first-page":"236","volume-title":"European Conference on Computer Vision (ECCV)","author":"Mondal Sounak","year":"2024","unstructured":"Sounak Mondal, Seoyoung Ahn, Zhibo Yang, Niranjan Balasubramanian, Dimitris Samaras, Gregory Zelinsky, and Minh Hoai. 2024. Look Hear: Gaze Prediction for Speech-directed Human Attention. In European Conference on Computer Vision (ECCV). Springer, 236\u2013255."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00145"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/2945078.2945153"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR59233.2023.00037"},{"key":"e_1_3_3_1_32_2","first-page":"73","volume-title":"European Conference on Computer Vision (ECCV)","author":"Quan Rong","year":"2024","unstructured":"Rong Quan, Yantao Lai, Mengyu Qiu, and Dong Liang. 2024. Pathformer3D: A 3D Scanpath Transformer for 360\u00b0 Images. In European Conference on Computer Vision (ECCV). Springer, 73\u201390."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"e_1_3_3_1_34_2","first-page":"1247","volume-title":"Conference on Robot Learning","author":"Saran Akanksha","year":"2020","unstructured":"Akanksha Saran, Elaine\u00a0Schaertl Short, Andrea Thomaz, and Scott Niekum. 2020. Understanding teacher gaze patterns for robot learning. In Conference on Robot Learning. PMLR, 1247\u20131258."},{"key":"e_1_3_3_1_35_2","first-page":"6989","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Sui Xiangjie","year":"2023","unstructured":"Xiangjie Sui, Yuming Fang, Hanwei Zhu, Shiqi Wang, and Zhou Wang. 2023. Scandmm: A deep markov model of scanpath prediction for 360deg images. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 6989\u20136999."},{"key":"e_1_3_3_1_36_2","unstructured":"Yu Sun Shuohuan Wang Yukun Li Shikun Feng Xuyi Chen Han Zhang Xin Tian Danxiang Zhu Hao Tian and Hua Wu. 2019. Ernie: Enhanced representation through knowledge integration. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1904.09223 (2019)."},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00206"},{"key":"e_1_3_3_1_38_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in Neural Information Processing systems (NeurIPs) 30 (2017)."},{"key":"e_1_3_3_1_39_2","first-page":"23318","volume-title":"International Conference on Machine Learning (ICML)","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning (ICML). 23318\u201323340."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01238"},{"key":"e_1_3_3_1_41_2","unstructured":"Haobo Yuan Xiangtai Li Tao Zhang Zilong Huang Shilin Xu Shunping Ji Yunhai Tong Lu Qi Jiashi Feng and Ming-Hsuan Yang. 2025. Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.04001 (2025)."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:49:19Z","timestamp":1781538559000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810608"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":40,"alternative-id":["10.1145\/3805622.3810608","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810608","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}