{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T03:11:39Z","timestamp":1779246699594,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":17,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3715525","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:06:11Z","timestamp":1748016371000},"page":"873-877","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Selective Multi-grained Alignment for Text-Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2690-8701","authenticated-orcid":false,"given":"Ziyi","family":"Bian","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5043-7006","authenticated-orcid":false,"given":"Cong","family":"Jiang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0081-0900","authenticated-orcid":false,"given":"Fangzhi","family":"Zhu","sequence":"additional","affiliation":[{"name":"Independent Researcher, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1470-6998","authenticated-orcid":false,"given":"Zheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Lisa Anne Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing moments in video with natural language. In ICCV. 5803--5812.","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Simion-Vlad Bogolin Ioana Croitoru Hailin Jin Yang Liu and Samuel Albanie. 2022. Cross modal retrieval with querybank normalisation. In CVPR. 5194--5205.","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"e_1_3_2_2_3_1","unstructured":"David Chen and William B Dolan. 2011. Collecting highly parallel data for paraphrase evaluation. In ACL. 190--200."},{"key":"e_1_3_2_2_4_1","volume-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)."},{"key":"e_1_3_2_2_5_1","volume-title":"Uatvr: Uncertainty-adaptive text-video retrieval. In ICCV. 13723--13733.","author":"Fang Bo","year":"2023","unstructured":"Bo Fang, Wenhao Wu, Chang Liu, Yu Zhou, Yuxin Song, Weiping Wang, Xiangbo Shu, Xiangyang Ji, and Jingdong Wang. 2023. Uatvr: Uncertainty-adaptive text-video retrieval. In ICCV. 13723--13733."},{"key":"e_1_3_2_2_6_1","volume-title":"X-pool: Cross-modal language-video attention for text-video retrieval. In CVPR. 5006--5015.","author":"Gorti Satya Krishna","year":"2022","unstructured":"Satya Krishna Gorti, No\u00ebl Vouitsis, Junwei Ma, Keyvan Golestan, Maksims Volkovs, Animesh Garg, and Guangwei Yu. 2022. X-pool: Cross-modal language-video attention for text-video retrieval. In CVPR. 5006--5015."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Kenji Hata Frederic Ren Li Fei-Fei and Juan Carlos Niebles. 2017. Dense-captioning events in videos. In ICCV. 706--715.","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_2_8_1","unstructured":"Pandeng Li Chen-Wei Xie Liming Zhao Hongtao Xie Jiannan Ge Yun Zheng Deli Zhao and Yongdong Zhang. 2023. Progressive spatio-temporal prototype matching for text-video retrieval. In ICCV. 4100--4110."},{"key":"e_1_3_2_2_9_1","volume-title":"CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval. Neurocomputing","author":"Luo Huaishao","year":"2021","unstructured":"Huaishao Luo, Lei Ji, Ming Zhong, Yang Chen, Wen Lei, Nan Duan, and Tianrui Li. 2021. CLIP4Clip: An Empirical Study of CLIP for End to End Video Clip Retrieval. Neurocomputing (2021)."},{"key":"e_1_3_2_2_10_1","volume-title":"Contrastive Incomplete Cross-Modal Hashing. TKDE","author":"Luo Haoyang","year":"2024","unstructured":"Haoyang Luo, Zheng Zhang, and Liqiang Nie. 2024. Contrastive Incomplete Cross-Modal Hashing. TKDE (2024)."},{"key":"e_1_3_2_2_11_1","volume-title":"X-clip: End-to-end multi-grained contrastive learning for video-text retrieval. In ACM MM. 638--647.","author":"Ma Yiwei","year":"2022","unstructured":"Yiwei Ma, Guohai Xu, Xiaoshuai Sun, Ming Yan, Ji Zhang, and Rongrong Ji. 2022. X-clip: End-to-end multi-grained contrastive learning for video-text retrieval. In ACM MM. 638--647."},{"key":"e_1_3_2_2_12_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"Kaibin Tian Ruixiang Zhao Zijie Xin Bangxiang Lan and Xirong Li. 2024. Holistic Features are almost Sufficient for Text-to-Video Retrieval. In CVPR. 17138--17147.","DOI":"10.1109\/CVPR52733.2024.01622"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Ziyang Wang Yi-Lin Sung Feng Cheng Gedas Bertasius and Mohit Bansal. 2023. Unified coarse-to-fine alignment for video-text retrieval. In ICCV. 2816--2827.","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"e_1_3_2_2_15_1","volume-title":"Contrastive Multi-Bit Collaborative Learning for Deep Cross-Modal Hashing. TKDE","author":"Wu Qingpeng","year":"2024","unstructured":"Qingpeng Wu, Zheng Zhang, Yishu Liu, Jingyi Zhang, and Liqiang Nie. 2024. Contrastive Multi-Bit Collaborative Learning for Deep Cross-Modal Hashing. TKDE (2024)."},{"key":"e_1_3_2_2_16_1","volume-title":"Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288--5296.","author":"Xu Jun","year":"2016","unstructured":"Jun Xu, Tao Mei, Ting Yao, and Yong Rui. 2016. Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288--5296."},{"key":"e_1_3_2_2_17_1","volume-title":"BadCM: Invisible Backdoor Attack Against Cross-Modal Learning. TIP","author":"Zhang Zheng","year":"2024","unstructured":"Zheng Zhang, Xu Yuan, Lei Zhu, Jingkuan Song, and Liqiang Nie. 2024. BadCM: Invisible Backdoor Attack Against Cross-Modal Learning. TIP (2024)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3715525","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3715525","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T17:37:23Z","timestamp":1759858643000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3715525"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":17,"alternative-id":["10.1145\/3701716.3715525","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3715525","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}