{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:30Z","timestamp":1781538870400,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810648","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1298-1307","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Event-Centric Structural Modeling for Zero-Shot Video Moment Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-7879-4321","authenticated-orcid":false,"given":"Xin","family":"Li","sequence":"first","affiliation":[{"name":"Institute of Information Engineering\uff0cChinese Academy of Sciences, Beijing, China and School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2963-0914","authenticated-orcid":false,"given":"Yongxiu","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering\uff0cChinese Academy of Sciences, Beijing, China and School of Cyber Security, University of Chinese Academy of Sciences, Beijing, Chile"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4136-7356","authenticated-orcid":false,"given":"Yuyao","family":"Kong","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering\uff0cChinese Academy of Sciences, Beijing, China and School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0258-7840","authenticated-orcid":false,"given":"Hongbo","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering\uff0cChinese Academy of Sciences, Beijing, China and School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3533-4874","authenticated-orcid":false,"given":"Gaopeng","family":"Gou","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering\uff0cChinese Academy of Sciences, Beijing, China and School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6464-3203","authenticated-orcid":false,"given":"Yubin","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering\uff0cChinese Academy of Sciences, Beijing, China and School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Dare\u00a0A Baldwin and Jodie\u00a0A Baird. 2001. Discerning intentions in dynamic human action. Trends in cognitive sciences 5 4 (2001) 171\u2013178.","DOI":"10.1016\/S1364-6613(00)01615-6"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Lu Dong Haiyu Zhang Hongjie Zhang Yifei Huang Zhen-Hua Ling Yu Qiao Limin Wang and Yali Wang. 2025. Weakly Supervised Temporal Sentence Grounding via Positive Sample Mining. IEEE Transactions on Circuits and Systems for Video Technology (2025).","DOI":"10.1109\/TCSVT.2025.3562249"},{"key":"e_1_3_3_1_4_2","unstructured":"Xuguang Duan Wenbing Huang Chuang Gan Jingdong Wang Wenwu Zhu and Junzhou Huang. 2018. Weakly supervised dense event captioning in videos. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Junyu Gao and Changsheng Xu. 2021. Learning video moment retrieval without a single annotated video. IEEE Transactions on Circuits and Systems for Video Technology 32 3 (2021) 1646\u20131657.","DOI":"10.1109\/TCSVT.2021.3075470"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00032"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_41"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00711"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01813"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01273"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00257"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_3_1_14_2","unstructured":"Jin-Seop Lee SungJoon Lee Jaehan Ahn YunSeok Choi and Jee-Hyong Lee. 2025. TAG: A Simple Yet Effective Temporal-Aware Approach for Zero-Shot Video Temporal Grounding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.07925 (2025)."},{"key":"e_1_3_3_1_15_2","first-page":"19730","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730\u201319742."},{"key":"e_1_3_3_1_16_2","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00304"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"KunChang Li Yinan He Yi Wang Yizhuo Li Wenhai Wang Ping Luo Yali Wang Limin Wang and Yu Qiao. 2025. Videochat: Chat-centric video understanding. Science China Information Sciences 68 10 (2025) 200102.","DOI":"10.1007\/s11432-024-4321-9"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00399"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20058"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20060"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02207"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00538"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00150"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658096"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28252"},{"key":"e_1_3_3_1_31_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_32_2","unstructured":"Peng Shi and Jimmy Lin. 2019. Simple BERT Models for Relation Extraction and Semantic Role Labeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1904.05255 (2019)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548004"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6924"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i9.32971"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02215"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3475723.3484247"},{"key":"e_1_3_3_1_41_2","unstructured":"Yitian Yuan Lin Ma Jingwen Wang Wei Liu and Wenwu Zhu. 2019. Semantic conditioned dynamic modulation for temporal sentence grounding in videos. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Jeffrey\u00a0M Zacks and Barbara Tversky. 2001. Event structure in perception and conception. Psychological Bulletin 127 1 (2001) 3.","DOI":"10.1037\/0033-2909.127.1.3"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Hang Zhang Xin Li and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.02858 (2023).","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331235"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.317"},{"key":"e_1_3_3_1_48_2","first-page":"20","volume-title":"European Conference on Computer Vision","author":"Zheng Minghang","year":"2024","unstructured":"Minghang Zheng, Xinhao Cai, Qingchao Chen, Yuxin Peng, and Yang Liu. 2024. Training-free video temporal grounding using large-scale pre-trained models. In European Conference on Computer Vision. Springer, 20\u201337."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.794"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20263"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01511"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"crossref","unstructured":"Qi Zheng Jianfeng Dong Xiaoye Qu Xun Yang Yabing Wang Pan Zhou Baolong Liu and Xun Wang. 2023. Progressive localization networks for language-based moment localization. ACM Transactions on Multimedia Computing Communications and Applications 19 2 (2023) 1\u201321.","DOI":"10.1145\/3543857"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:01:40Z","timestamp":1781535700000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810648"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":51,"alternative-id":["10.1145\/3805622.3810648","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810648","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}