{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:06Z","timestamp":1781538846726,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100012226","name":"National Key R&D Program of China","doi-asserted-by":"publisher","award":["2024YFB3311600"],"award-info":[{"award-number":["2024YFB3311600"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"Anhui Provincial Natural Science Foundation","doi-asserted-by":"publisher","award":["2408085J040"],"award-info":[{"award-number":["2408085J040"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"Major Project of the Anhui Provincial Science and Technology Breakthrough Program","doi-asserted-by":"publisher","award":["202423k09020001"],"award-info":[{"award-number":["202423k09020001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["JZ2024AHST0337\uff0cJZ2025HGQA0139\uff0cJZ2025HGTA0160\uff0c"],"award-info":[{"award-number":["JZ2024AHST0337\uff0cJZ2025HGQA0139\uff0cJZ2025HGTA0160\uff0c"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272144\uff0c62501224"],"award-info":[{"award-number":["62272144\uff0c62501224"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810886","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1523-1532","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SIGaze: Toward Pixel-Level Single-Instance Gaze Object Prediction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2431-5580","authenticated-orcid":false,"given":"Dongxing","family":"Duan","sequence":"first","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3149-1615","authenticated-orcid":false,"given":"Xu","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4462-315X","authenticated-orcid":false,"given":"Jingyuan","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0371-5557","authenticated-orcid":false,"given":"Ruijie","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2594-254X","authenticated-orcid":false,"given":"Dan","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01373"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00925"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Wenhe Chen Hui Xu Chao Zhu Xiaoli Liu Yinghua Lu Caixia Zheng and Jun Kong. 2021. Gaze estimation via the joint modeling of multiple cues. IEEE Transactions on Circuits and Systems for Video Technology 32 3 (2021) 1390\u20131402.","DOI":"10.1109\/TCSVT.2021.3071621"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Eunji Chong Katha Chanda Zhefan Ye Audrey Southerland Nataniel Ruiz Rebecca\u00a0M Jones Agata Rozga and James\u00a0M Rehg. 2017. Detecting gaze towards eyes in natural social interactions and its use in child assessment. Proceedings of the ACM on Interactive Mobile Wearable and Ubiquitous Technologies 1 3 (2017) 1\u201320.","DOI":"10.1145\/3131902"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_24"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00544"},{"key":"e_1_3_3_1_9_2","unstructured":"Alexey Dosovitskiy. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00667"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_24"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01123"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00552"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Zhengxi Hu Kunxu Zhao Bohan Zhou Hang Guo Shichao Wu Yuxue Yang and Jingtai Liu. 2022. Gaze target estimation inspired by interactive attention. IEEE Transactions on Circuits and Systems for Video Technology 32 12 (2022) 8524\u20138536.","DOI":"10.1109\/TCSVT.2022.3190314"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/FG52635.2021.9666980"},{"key":"e_1_3_3_1_18_2","first-page":"369","volume-title":"European Conference on Computer Vision","author":"Jin Yang","year":"2024","unstructured":"Yang Jin, Lei Zhang, Shi Yan, Bin Fan, and Binglu Wang. 2024. Boosting gaze object prediction via pixel-level supervision from vision foundation model. In European Conference on Computer Vision. Springer, 369\u2013386."},{"key":"e_1_3_3_1_19_2","unstructured":"DiederikP. Kingma and Jimmy Ba. 2014. Adam: A Method for Stochastic Optimization. arXiv:https:\/\/arXiv.org\/abs\/ Learning arXiv: Learning (Dec 2014)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00465"},{"key":"e_1_3_3_1_24_2","first-page":"35","volume-title":"Asian Conference on Computer Vision","author":"Lian Dongze","year":"2018","unstructured":"Dongze Lian, Zehao Yu, and Shenghua Gao. 2018. Believe it or not, we know what you are looking at!. In Asian Conference on Computer Vision. Springer, 35\u201350."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Chunmian Lin Daxin Tian Xuting Duan Jianshan Zhou Dezong Zhao and Dongpu Cao. 2022. 3D-DFM: Anchor-free multimodal 3-D object detection with dynamic fusion module for autonomous driving. IEEE Transactions on Neural Networks and Learning Systems 34 12 (2022) 10812\u201310822.","DOI":"10.1109\/TNNLS.2022.3171553"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Dong Liu Xin Zhao and Weiqiang Fan. 2025. A small object detection algorithm for mine environment. Engineering Applications of Artificial Intelligence 153 (2025) 110936.","DOI":"10.1016\/j.engappai.2025.110936"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-323-99864-2.00016-0"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20085-5_23"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Beno\u00eet Mass\u00e9 Sil\u00e8ye Ba and Radu Horaud. 2017. Tracking gaze and visual focus of attention of people involved in social interaction. IEEE transactions on pattern analysis and machine intelligence 40 11 (2017) 2711\u20132724.","DOI":"10.1109\/TPAMI.2017.2782819"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00094"},{"key":"e_1_3_3_1_31_2","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et\u00a0al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.07193 (2023)."},{"key":"e_1_3_3_1_32_2","unstructured":"Adria Recasens Aditya Khosla Carl Vondrick and Antonio Torralba. 2015. Where are they looking? Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.160"},{"key":"e_1_3_3_1_34_2","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02689"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Yuehao Song Xinggang Wang Jingfeng Yao Wenyu Liu Jinglin Zhang and Xiangmin Xu. 2024. Vitgaze: gaze following with interaction features in vision transformers. Visual Intelligence 2 1 (2024) 31.","DOI":"10.1007\/s44267-024-00064-9"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3610661.3616239"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01914"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00196"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00349"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3556624"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01998"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00224"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Danyang Tu Wei Shen Wei Sun Xiongkuo Min Guangtao Zhai and Changwen Chen. 2023. Un-Gaze: A unified transformer for joint gaze-location and gaze-object detection. IEEE Transactions on Circuits and Systems for Video Technology 34 5 (2023) 3271\u20133285.","DOI":"10.1109\/TCSVT.2023.3318839"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i9.28883"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01898"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Xinming Wang Hanlin Zhang Zhiyong Wang Wei Nie Zhihao Yang Weihong Ren Qiong Xu Xiu Xu and Honghai Liu. 2023. Dual regression-enhanced gaze target detection in the wild. IEEE Transactions on Cybernetics 54 1 (2023) 219\u2013229.","DOI":"10.1109\/TCYB.2023.3244269"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Tianfei Zhou Siyuan Qi Wenguan Wang Jianbing Shen and Song-Chun Zhu. 2021. Cascaded parsing of human-object interaction recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 6 (2021) 2827\u20132840.","DOI":"10.1109\/TPAMI.2021.3049156"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00093"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:59:26Z","timestamp":1781535566000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810886"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":49,"alternative-id":["10.1145\/3805622.3810886","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810886","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}