{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:05:18Z","timestamp":1765339518680,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755348","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:15Z","timestamp":1761375255000},"page":"8174-8183","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["PRE-MAP: Personalized Reinforced Eye-tracking Multimodal LLM for High-Resolution Multi-Attribute Point Prediction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3838-4124","authenticated-orcid":false,"given":"Hanbing","family":"Wu","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, Jilin University, Changchun city, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6161-060X","authenticated-orcid":false,"given":"Ping","family":"Jiang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0085-3527","authenticated-orcid":false,"given":"Anyang","family":"Su","sequence":"additional","affiliation":[{"name":"Mininglamp Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4044-5701","authenticated-orcid":false,"given":"Chenxu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Mininglamp Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7656-350X","authenticated-orcid":false,"given":"Tianyu","family":"Fu","sequence":"additional","affiliation":[{"name":"Mininglamp Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4577-3002","authenticated-orcid":false,"given":"Minghui","family":"Wu","sequence":"additional","affiliation":[{"name":"Mininglamp Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5009-0692","authenticated-orcid":false,"given":"Beiping","family":"Tan","sequence":"additional","affiliation":[{"name":"Mininglamp Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3637-2581","authenticated-orcid":false,"given":"Huiying","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Jilin University, Changchun, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00625"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.physbeh.2019.03.023"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2815601"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02402"},{"key":"e_1_3_2_1_5_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024b. Expanding Performance Boundaries of Open-Source Multimodal Models with Model Data and Test-Time Scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2851672"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_25"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.5555\/3001460.3001507"},{"key":"e_1_3_2_1_9_1","unstructured":"Gemini Team et al. 2024. Gemini 1.5: Unlocking Multimodal Understanding Across Millions of Tokens of Context. arXiv preprint (2024)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00163"},{"key":"e_1_3_2_1_11_1","volume-title":"FastSal: A Computationally Efficient Network for Visual Saliency Prediction. In 2020 25th International Conference on Pattern Recognition (ICPR). IEEE, 9054-9061","author":"Hu Feiyan","year":"2021","unstructured":"Feiyan Hu and Kevin McGuinness. 2021. FastSal: A Computationally Efficient Network for Visual Saliency Prediction. In 2020 25th International Conference on Pattern Recognition (ICPR). IEEE, 9054-9061."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.730558"},{"key":"e_1_3_2_1_13_1","first-page":"573","article-title":"Eye Tracking in Human-Computer Interaction and Usability Research: Ready to Deliver the Promises. In The Mind's Eye","author":"Jacob Robert JK","year":"2003","unstructured":"Robert JK Jacob and Keith S Karn. 2003. Eye Tracking in Human-Computer Interaction and Usability Research: Ready to Deliver the Promises. In The Mind's Eye. North-Holland, 573-605.","journal-title":"North-Holland"},{"key":"e_1_3_2_1_14_1","volume-title":"EML-NET: An Expandable Multi-Layer Network for Saliency Prediction. CoRR","author":"Jia Sen","year":"2018","unstructured":"Sen Jia. 2018. EML-NET: An Expandable Multi-Layer Network for Saliency Prediction. CoRR, Vol. abs\/1805.01047 (2018). http:\/\/arxiv.org\/abs\/1805.01047"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00213"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00213"},{"key":"e_1_3_2_1_17_1","first-page":"192","volume-title":"Science","volume":"215","author":"Jonides John","year":"1982","unstructured":"John Jonides, David E. Irwin, and Steven Yantis. 1982. Integrating Visual Information from Successive Fixations. Science, Vol. 215, 4529 (1982), 192-194."},{"key":"e_1_3_2_1_18_1","unstructured":"Tilke Judd Fr\u00e9do Durand and Antonio Torralba. 2012. A Benchmark of Computational Models of Saliency to Predict Human Fixations. Technical Report. CSAIL Technical Reports."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459462"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/1622737.1622748"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01268"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.04.080"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.visres.2010.08.016"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.3389\/fmedt.2023.1253001"},{"volume-title":"Advances in Neural Information Processing Systems","author":"Mathe Stefan","key":"e_1_3_2_1_25_1","unstructured":"Stefan Mathe and Cristian Sminchisescu. 2013. Action from Still Image Dataset and Inverse Optimal Control to Learn Task Specific Visual Scanpaths. In Advances in Neural Information Processing Systems, C.J. Burges, L. Bottou, M. Welling, Z. Ghahramani, and K.Q. Weinberger (Eds.), Vol. 26. Curran Associates, Inc."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-psych-122414-033400"},{"key":"e_1_3_2_1_27_1","volume-title":"Sensors","volume":"20","author":"Moroto Yuya","year":"2020","unstructured":"Yuya Moroto, Keisuke Maeda, Takahiro Ogawa, and Miki Haseyama. 2020. Few-shot personalized saliency prediction based on adaptive image selection considering object and visual attention. Sensors, Vol. 20, 8 (2020)."},{"key":"e_1_3_2_1_28_1","first-page":"647","volume-title":"Science","volume":"341","author":"Muchnik Lev","year":"2013","unstructured":"Lev Muchnik, Sinan Aral, and Sean J Taylor. 2013. Social influence bias: A randomized experiment. Science, Vol. 341, 6146 (2013), 647-651."},{"key":"e_1_3_2_1_29_1","volume-title":"Advances in Neural Information Processing Systems","volume":"18","author":"Bruce","unstructured":"Bruce N. and Tsotsos J., 2005. Saliency based on information maximization. In Advances in Neural Information Processing Systems, Vol. 18."},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI. 2024. GPT-4o Introduction. (2024). Accessed: 2024-11-15."},{"key":"e_1_3_2_1_31_1","volume-title":"MDS-ViTNet: Improving Saliency Prediction for Eye-Tracking with Vision Transformer. Doklady Mathematics","volume":"110","author":"Polezhaev I.","year":"2024","unstructured":"I. Polezhaev, Igor Goncharenko, and N. Iurina. 2024. MDS-ViTNet: Improving Saliency Prediction for Eye-Tracking with Vision Transformer. Doklady Mathematics, Vol. 110, Suppl 1 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Cappuccio","author":"Racine Timothy P.","year":"2012","unstructured":"Timothy P. Racine, David A. Leavens, Colwyn Trevarthen, Peter Hobson, Jessica Hobson, Vasudevi Reddy, Malinda Carpenter, Kristin Liebal, Stephen V. Shepherd, and Massimiliano L. Cappuccio. 2012. Joint Attention: New Developments in Psychology, Philosophy of Mind, and Social Neuroscience. MIT Press, Cambridge, MA."},{"key":"e_1_3_2_1_33_1","volume-title":"Direct preference optimization: Your language model is secretly a reward model. Advances in neural information processing systems","author":"Rafailov Rafael","year":"2023","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. 2023. Direct preference optimization: Your language model is secretly a reward model. Advances in neural information processing systems, Vol. 36 (2023), 53728-53741."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.147"},{"volume-title":"The Brain, Emotion, and Depression","author":"Rolls Edmund T.","key":"e_1_3_2_1_35_1","unstructured":"Edmund T. Rolls. 2018. The Brain, Emotion, and Depression. Oxford University Press, Oxford."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1163\/156856896x00123"},{"key":"e_1_3_2_1_37_1","unstructured":"John Schulman Filip Wolski Prafulla Dhariwal et al. 2017. Proximal Policy Optimization Algorithms. arXiv preprint (2017)."},{"key":"e_1_3_2_1_38_1","unstructured":"Zhihong Shao Peiyi Wang Qihao Zhu Runxin Xu Junxiao Song Mingchuan Zhang Y.K. Li Y. Wu and Daya Guo. 2024. DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.3389\/fnint.2010.00005"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00221-024-06823-w"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2403.13653"},{"key":"e_1_3_2_1_42_1","volume-title":"Barto","author":"Sutton Richard S.","year":"1998","unstructured":"Richard S. Sutton and Andrew G. Barto. 1998. Reinforcement Learning: An Introduction. Vol. 1. MIT Press, Cambridge, MA."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2787612"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680810"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2866563"},{"key":"e_1_3_2_1_46_1","volume-title":"arXiv preprint arXiv:2412.15115","author":"Yang An","year":"2024","unstructured":"An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. 2024. Qwen2.5 Technical Report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding. arXiv preprint arXiv:2501.13106","author":"Zhang Boqiang","year":"2025","unstructured":"Boqiang Zhang, Kehan Li, Zesen Cheng, Zhiqiang Hu, Yuqian Yuan, Guanzheng Chen, Sicong Leng, Yuming Jiang, Hang Zhang, Xin Li, Peng Jin, Wenqi Zhang, Fan Wang, Lidong Bing, and Deli Zhao. 2025. VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding. arXiv preprint arXiv:2501.13106 (2025)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755348","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:02:31Z","timestamp":1765339351000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755348"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":47,"alternative-id":["10.1145\/3746027.3755348","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755348","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}