{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:53:41Z","timestamp":1781538821029,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"the National Natural Science Foundation of China (NSFC)","award":["62472105"],"award-info":[{"award-number":["62472105"]}]},{"name":"the Natural Science Foundation of Guangdong Province","award":["2025A1515011385"],"award-info":[{"award-number":["2025A1515011385"]}]},{"name":"the Natural Science Foundation of Guangdong Province","award":["2024A1515010186"],"award-info":[{"award-number":["2024A1515010186"]}]},{"name":"the Research Foundation of Guangdong-hong Kong-Macao Applied Mathematics Center","award":["2025A1515060016"],"award-info":[{"award-number":["2025A1515060016"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810655","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"366-375","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SCAN: Self-Calibrated Textual Anchoring for Dual-Granularity Video Screening in Text-Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2680-7034","authenticated-orcid":false,"given":"Dixin","family":"Chen","sequence":"first","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9092-3164","authenticated-orcid":false,"given":"Baoyao","family":"Yang","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7810-9776","authenticated-orcid":false,"given":"Haifeng","family":"Lin","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1826-8510","authenticated-orcid":false,"given":"Canrong","family":"Du","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5242-6197","authenticated-orcid":false,"given":"Wenbin","family":"Yao","sequence":"additional","affiliation":[{"name":"WeChat, Tencent, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_3_1_3_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et\u00a0al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.13923 (2025)."},{"key":"e_1_3_3_1_4_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Bai Zechen","year":"2025","unstructured":"Zechen Bai, Tianjun Xiao, Tong He, Pichao WANG, Zheng Zhang, Thomas Brox, and Mike\u00a0Zheng Shou. 2025. Bridging Information Asymmetry in Text-video Retrieval: A Data-centric Approach. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Jingwen Chen Yingwei Pan Yehao Li Ting Yao Hongyang Chao and Tao Mei. 2023. Retrieval augmented convolutional encoder-decoder networks for video captioning. ACM Transactions on Multimedia Computing Communications and Applications 19 1s (2023) 1\u201324.","DOI":"10.1145\/3539225"},{"key":"e_1_3_3_1_8_2","unstructured":"Xing Cheng Hezheng Lin Xiangyu Wu Fan Yang and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.04290 (2021)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657833"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733460"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Xingzhong Du Hongzhi Yin Ling Chen Yang Wang Yi Yang and Xiaofang Zhou. 2018. Personalized video recommendation using rich contents from videos. IEEE Transactions on Knowledge and Data Engineering 32 3 (2018) 492\u2013505.","DOI":"10.1109\/TKDE.2018.2885520"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01262"},{"key":"e_1_3_3_1_13_2","unstructured":"Han Fang Pengfei Xiong Luhui Xu and Yu Chen. 2021. Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.11097 (2021)."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01025"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_36"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME59968.2025.11209054"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01107"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02440"},{"key":"e_1_3_3_1_20_2","first-page":"4904","volume-title":"International conference on machine learning","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904\u20134916."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00234"},{"key":"e_1_3_3_1_22_2","unstructured":"Peng Jin Hao Li Li Yuan Shuicheng Yan and Jie Chen. 2024. Hierarchical banzhaf interaction for general video-language representation learning. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32437"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.02271"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00379"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00782"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00634"},{"key":"e_1_3_3_1_29_2","unstructured":"Yang Liu Samuel Albanie Arsha Nagrani and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.13487 (2019)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28177"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Huaishao Luo Lei Ji Ming Zhong Yang Chen Wen Lei Nan Duan and Tianrui Li. 2022. Clip4clip: An empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508 (2022) 293\u2013304.","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_3_1_32_2","first-page":"76","volume-title":"European Conference on Computer Vision","author":"Ma Zongyang","year":"2024","unstructured":"Zongyang Ma, Ziqi Zhang, Yuxin Chen, Zhongang Qi, Chunfeng Yuan, Bing Li, Yingmin Luo, Xu Li, Xiaojuan Qi, Ying Shan, and Weiming Hu. 2024. Ea-vtr: Event-aware video-text retrieval. In European Conference on Computer Vision. Springer, 76\u201394."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_5"},{"key":"e_1_3_3_1_34_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01834"},{"key":"e_1_3_3_1_36_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Shen Leqi","year":"2025","unstructured":"Leqi Shen, Tianxiang Hao, Tao He, Sicheng Zhao, Yifeng Zhang, Pengzhang Liu, Yongjun Bao, and Guiguang Ding. 2025. TempMe: Video Temporal Token Merging for Efficient Text-Video Retrieval. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28327"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733314"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01566"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Jiamian Wang Pichao Wang Dongfang Liu Qiang Guan Sohail Dianat Majid Rabbani Raghuveer Rao and Zhiqiang Tao. 2024. Diffusion-inspired truncated sampler for text-video retrieval. Advances in Neural Information Processing Systems 37 (2024) 3882\u20133906.","DOI":"10.52202\/079017-0127"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_14"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00365"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00640"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Wenhao Wu Xiaohan Wang Haipeng Luo Jingdong Wang Yi Yang and Wanli Ouyang. 2024. Cap4video++: Enhancing video understanding with auxiliary captions. IEEE Transactions on Pattern Analysis and Machine Intelligence 47 (2024) 5223\u20135237.","DOI":"10.1109\/TPAMI.2024.3410329"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32935"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Haonan Zhang Pengpeng Zeng Lianli Gao Jingkuan Song Yihang Duan Xinyu Lyu and Heng\u00a0Tao Shen. 2025. Text-video retrieval with global-local semantic consistent learning. IEEE Transactions on Image Processing 34 (2025) 3463\u20133474.","DOI":"10.1109\/TIP.2025.3574925"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680839"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"crossref","unstructured":"Yanwei Zheng Bowen Huang Zekai Chen and Dongxiao Yu. 2025. Enhancing Text-Video Retrieval Performance With Low-Salient but Discriminative Objects. IEEE Transactions on Image Processing 34 (2025) 581\u2013593.","DOI":"10.1109\/TIP.2025.3527369"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:55:20Z","timestamp":1781535320000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810655"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":51,"alternative-id":["10.1145\/3805622.3810655","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810655","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}