{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T11:49:09Z","timestamp":1774352949886,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"name":"Natural Science Foundation of China","award":["62302453,61976192"],"award-info":[{"award-number":["62302453,61976192"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LMS25F020003,LRG25F020002"],"award-info":[{"award-number":["LMS25F020003,LRG25F020002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733330","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"1367-1376","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Event-Driven Hybrid and Cross-Stage Guide for Video Corpus Moment Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6753-6569","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3777-9764","authenticated-orcid":false,"given":"Kun","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7659-3877","authenticated-orcid":false,"given":"Zengrong","family":"Lin","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6177-3862","authenticated-orcid":false,"given":"Cong","family":"Bai","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1015"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018175"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3316025"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128029"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532078"},{"key":"e_1_3_2_1_8_1","first-page":"4065","article-title":"Dual encoding for video retrieval by text","volume":"44","author":"Dong Jianfeng","year":"2021","unstructured":"Jianfeng Dong, Xirong Li, Chaoxi Xu, Xun Yang, Gang Yang, Xun Wang, and Meng Wang. 2021. Dual encoding for video retrieval by text. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 8 (2021), 4065--4080.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_10_1","volume-title":"Exploiting visual semantic reasoning for video-text retrieval. arXiv preprint arXiv:2006.08889","author":"Feng Zerun","year":"2020","unstructured":"Zerun Feng, Zhimin Zeng, Caili Guo, and Zheng Li. 2020. Exploiting visual semantic reasoning for video-text retrieval. arXiv preprint arXiv:2006.08889 (2020)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CDC.2007.4434535"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_1_14_1","volume-title":"Excl: Extractive clip localization using natural language descriptions. arXiv preprint arXiv:1904.02755","author":"Ghosh Soham","year":"2019","unstructured":"Soham Ghosh, Anuva Agarwal, Zarana Parekh, and Alexander Hauptmann. 2019. Excl: Extractive clip localization using natural language descriptions. arXiv preprint arXiv:1904.02755 (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Coot: Cooperative hierarchical transformer for video-text representation learning. Advances in neural information processing systems","author":"Ging Simon","year":"2020","unstructured":"Simon Ging, Mohammadreza Zolfaghari, Hamed Pirsiavash, and Thomas Brox. 2020. Coot: Cooperative hierarchical transformer for video-text representation learning. Advances in neural information processing systems, Vol. 33 (2020), 22605--22618."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3222664"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475241"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018393"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_20_1","volume-title":"Event-aware Video Corpus Moment Retrieval. arXiv preprint arXiv:2402.13566","author":"Hou Danyang","year":"2024","unstructured":"Danyang Hou, Liang Pang, Huawei Shen, and Xueqi Cheng. 2024a. Event-aware Video Corpus Moment Retrieval. arXiv preprint arXiv:2402.13566 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658088"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475281"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462974"},{"key":"e_1_3_2_1_25_1","volume-title":"Modal-specific pseudo query generation for video corpus moment retrieval. arXiv preprint arXiv:2210.12617","author":"Jung Minjoon","year":"2022","unstructured":"Minjoon Jung, Seongho Choi, Joochan Kim, Jin-Hwa Kim, and Byoung-Tak Zhang. 2022. Modal-specific pseudo query generation for video corpus moment retrieval. arXiv preprint arXiv:2210.12617 (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings, Part XXI 16","author":"Lei Jie","year":"2020","unstructured":"Jie Lei, Licheng Yu, Tamara L Berg, and Mohit Bansal. 2020. Tvr: A large-scale dataset for video-subtitle moment retrieval. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXI 16. Springer, 447--463."},{"key":"e_1_3_2_1_27_1","volume-title":"Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200","author":"Li Linjie","year":"2020","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020a. Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)."},{"key":"e_1_3_2_1_28_1","volume-title":"William Yang Wang, et al.","author":"Li Linjie","year":"2021","unstructured":"Linjie Li, Jie Lei, Zhe Gan, Licheng Yu, Yen-Chun Chen, Rohit Pillai, Yu Cheng, Luowei Zhou, Xin Eric Wang, William Yang Wang, et al. 2021. Value: A multi-task benchmark for video-and-language understanding evaluation. arXiv preprint arXiv:2106.04632 (2021)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25222"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3042067"},{"key":"e_1_3_2_1_31_1","volume-title":"NeighborRetr: Balancing Hub Centrality in Cross-Modal Retrieval. arXiv preprint arXiv:2503.10526","author":"Lin Zengrong","year":"2025","unstructured":"Zengrong Lin, Zheng Wang, Tianwen Qian, Pan Mu, Sixian Chan, and Cong Bai. 2025. NeighborRetr: Balancing Hub Centrality in Cross-Modal Retrieval. arXiv preprint arXiv:2503.10526 (2025)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475621"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210003"},{"key":"e_1_3_2_1_34_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_35_1","volume-title":"Spatial--temporal video grounding with cross-modal understanding and enhancement. Expert Systems with Applications","author":"Luo Shu","year":"2025","unstructured":"Shu Luo, Jingyu Pan, Da Cao, Jiawei Wang, Yuquan Le, and Meng Liu. 2025. Spatial--temporal video grounding with cross-modal understanding and enhancement. Expert Systems with Applications (2025), 126650."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME55011.2023.00257"},{"key":"e_1_3_2_1_37_1","volume-title":"Distinguishing Visually Similar Images: Triplet Contrastive Learning Framework for Image-text Retrieval. In 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6.","author":"Ouyang Pengxiang","year":"2024","unstructured":"Pengxiang Ouyang, Jianan Chen, Qing Ma, Zheng Wang, and Cong Bai. 2024. Distinguishing Visually Similar Images: Triplet Contrastive Learning Framework for Image-text Retrieval. In 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3090595"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00695"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01566"},{"key":"e_1_3_2_1_41_1","volume-title":"Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:2203.07111","author":"Wang Qiang","year":"2022","unstructured":"Qiang Wang, Yanhao Zhang, Yun Zheng, Pan Pan, and Xian-Sheng Hua. 2022. Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:2203.07111 (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00042"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475278"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475515"},{"key":"e_1_3_2_1_45_1","volume-title":"European Conference on Computer Vision. Springer, 390--408","author":"Wu Xuan","year":"2024","unstructured":"Xuan Wu, Hongxiang Li, Yuanjiang Luo, Xuxin Cheng, Xianwei Zhuang, Meng Cao, and Keren Fu. 2024. Uncertainty-aware sign language video retrieval with probability distribution modeling. In European Conference on Computer Vision. Springer, 390--408."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_11"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"e_1_3_2_1_49_1","volume-title":"A hierarchical multi-modal encoder for moment localization in video corpus. arXiv preprint arXiv:2011.09046","author":"Zhang Bowen","year":"2020","unstructured":"Bowen Zhang, Hexiang Hu, Joonseok Lee, Ming Zhao, Sheide Chammas, Vihan Jain, Eugene Ie, and Fei Sha. 2020a. A hierarchical multi-modal encoder for moment localization in video corpus. arXiv preprint arXiv:2011.09046 (2020)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462874"},{"key":"e_1_3_2_1_52_1","volume-title":"Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931","author":"Zhang Hao","year":"2020","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, and Joey Tianyi Zhou. 2020b. Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931 (2020)."},{"key":"e_1_3_2_1_53_1","volume-title":"Video corpus moment retrieval via deformable multigranularity feature fusion and adversarial training","author":"Zhang Xuemei","year":"2023","unstructured":"Xuemei Zhang, Peng Zhao, Jinsheng Ji, Xiankai Lu, and Yilong Yin. 2023. Video corpus moment retrieval via deformable multigranularity feature fusion and adversarial training. IEEE Transactions on Circuits and Systems for Video Technology (2023)."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","location":"Chicago IL USA","acronym":"ICMR '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733330","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:07:28Z","timestamp":1755749248000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733330"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":53,"alternative-id":["10.1145\/3731715.3733330","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733330","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}