{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:55:32Z","timestamp":1781538932720,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"the Fundamental Research Funds for the Central Universities of China","award":["No. PA2025IISL0110"],"award-info":[{"award-number":["No. PA2025IISL0110"]}]},{"name":"the Fundamental Research Funds for the Central Universities of China","award":["No. JZ2025HGTB0226"],"award-info":[{"award-number":["No. JZ2025HGTB0226"]}]},{"name":"National Natural Science Foundation of China","award":["No. 62202139"],"award-info":[{"award-number":["No. 62202139"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810695","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"327-336","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Fine-grained Text-Video Retrieval with Patch-level Temporal Difference and Aggregation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3196-2188","authenticated-orcid":false,"given":"Jialong","family":"Hu","sequence":"first","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1262-764X","authenticated-orcid":false,"given":"Zijie","family":"Song","sequence":"additional","affiliation":[{"name":"Anhui University, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7243-2177","authenticated-orcid":false,"given":"Yang","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1042-8361","authenticated-orcid":false,"given":"Zhenzhen","family":"Hu","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9446-249X","authenticated-orcid":false,"given":"Jia","family":"Li","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7849-6950","authenticated-orcid":false,"given":"Yixiao","family":"Ma","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5461-3986","authenticated-orcid":false,"given":"Richang","family":"Hong","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_3_1_4_2","first-page":"4","volume-title":"Icml","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In Icml , Vol.\u00a02. 4."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_3_1_6_2","unstructured":"Xing Cheng Hezheng Lin Xiangyu Wu Fan Yang and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.04290 (2021)."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01262"},{"key":"e_1_3_3_1_8_2","unstructured":"Han Fang Pengfei Xiong Luhui Xu and Yu Chen. 2021. Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.11097 (2021)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01025"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Peng Jin Jinfa Huang Fenglin Liu Xian Wu Shen Ge Guoli Song David Clifton and Jie Chen. 2022. Expectation-maximization contrastive learning for compact video-and-language representations. NeurIPS 35 (2022) 30291\u201330306.","DOI":"10.52202\/068431-2196"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00234"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02563"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3754833"},{"key":"e_1_3_3_1_19_2","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_1_20_2","first-page":"237","volume-title":"European conference on computer vision","author":"Li Kunchang","year":"2024","unstructured":"Kunchang Li, Xinhao Li, Yi Wang, Yinan He, Yali Wang, Limin Wang, and Yu Qiao. 2024. Videomamba: State space model for efficient video understanding. In European conference on computer vision. Springer, 237\u2013255."},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00379"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00865"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2025\/643"},{"key":"e_1_3_3_1_24_2","unstructured":"Yang Liu Samuel Albanie Arsha Nagrani and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.13487 (2019)."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Huaishao Luo Lei Ji Ming Zhong Yang Chen Wen Lei Nan Duan and Tianrui Li. 2022. Clip4clip: An empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508 (2022) 293\u2013304.","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_3_1_29_2","first-page":"8748","volume-title":"ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In ICML. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01834"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01835"},{"key":"e_1_3_3_1_32_2","unstructured":"Leqi Shen Tianxiang Hao Tao He Sicheng Zhao Yifeng Zhang Pengzhang Liu Yongjun Bao and Guiguang Ding. 2024. Tempme: Video temporal token merging for efficient text-video retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.01156 (2024)."},{"key":"e_1_3_3_1_33_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME59968.2025.11209469"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i7.32778"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28327"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01622"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01566"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Jiamian Wang Pichao Wang Dongfang Liu Qiang Guan Sohail Dianat Majid Rabbani Raghuveer Rao and Zhiqiang Tao. 2024. Diffusion-inspired truncated sampler for text-video retrieval. Advances in Neural Information Processing Systems 37 (2024) 3882\u20133906.","DOI":"10.52202\/079017-0127"},{"key":"e_1_3_3_1_40_2","unstructured":"Qiang Wang Yanhao Zhang Yun Zheng Pan Pan and Xian-Sheng Hua. 2022. Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.07111 (2022)."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32935"},{"key":"e_1_3_3_1_45_2","unstructured":"Jian Xiao Zijie Song Jialong Hu Hao Cheng Jia Li Zhenzhen Hu and Richang Hong. 2025. Rebalancing Contrastive Alignment with Bottlenecked Semantic Increments in Text-Video Retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.12499 (2025)."},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Hu Xu Gargi Ghosh Po-Yao Huang Dmytro Okhonko Armen Aghajanyan Florian Metze Luke Zettlemoyer and Christoph Feichtenhofer. 2021. Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.14084 (2021).","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_3_1_48_2","volume-title":"ICLR","author":"Xue Hongwei","year":"2023","unstructured":"Hongwei Xue, Yuchong Sun, Bei Liu, Jianlong Fu, Ruihua Song, Houqiang Li, and Jiebo Luo. 2023. Clip-vip: Adapting pre-trained image-text model to video-language alignment. In ICLR."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28475"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00221"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:13:12Z","timestamp":1781536392000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810695"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":52,"alternative-id":["10.1145\/3805622.3810695","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810695","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}