{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:11Z","timestamp":1765339751549,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62472385"],"award-info":[{"award-number":["62472385"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Pioneer and Leading Goose R&D Program of Zhejiang","award":["2024C01110"],"award-info":[{"award-number":["2024C01110"]}]},{"name":"Pioneer and Leading Goose R&D Program of Zhejiang","award":["2023C01212"],"award-info":[{"award-number":["2023C01212"]}]},{"name":"Young Elite Scientists Sponsorship Program by China Association for Science and Technology","award":["2022QNRC001"],"award-info":[{"award-number":["2022QNRC001"]}]},{"name":"Public Welfare Technology Research Project of Zhejiang Province","award":["LGF21F020010"],"award-info":[{"award-number":["LGF21F020010"]}]},{"name":"Fundamental Research Funds for the Provincial Universities of Zhejiang","award":["FR2402ZD"],"award-info":[{"award-number":["FR2402ZD"]}]},{"name":"Zhejiang Provincial High-Level Talent Special Support Program"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754982","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"6027-6036","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Audio Does Matter: Importance-Aware Multi-Granularity Fusion for Video Moment Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-5612-866X","authenticated-orcid":false,"given":"Junan","family":"Lin","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8179-4508","authenticated-orcid":false,"given":"Daizong","family":"Liu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6015-3242","authenticated-orcid":false,"given":"Xianke","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4907-3978","authenticated-orcid":false,"given":"Xiaoye","family":"Qu","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0201-1638","authenticated-orcid":false,"given":"Xun","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0271-9196","authenticated-orcid":false,"given":"Jixiang","family":"Zhu","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8604-874X","authenticated-orcid":false,"given":"Sanyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5244-3274","authenticated-orcid":false,"given":"Jianfeng","family":"Dong","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","volume-title":"International conference on machine learning. PMLR, 233--242","author":"Arpit Devansh","year":"2017","unstructured":"Devansh Arpit, Stanislaw Jastrzebski, Nicolas Ballas, David Krueger, Emmanuel Bengio, Maxinder S Kanwal, Tegan Maharaj, Asja Fischer, Aaron Courville, Yoshua Bengio, et al. 2017. A closer look at memorization in deep networks. In International conference on machine learning. PMLR, 233--242."},{"key":"e_1_3_2_1_3_1","volume-title":"Hear me out: Fusional approaches for audio augmented temporal action localization. arXiv preprint arXiv:2106.14118","author":"Bagchi Anurag","year":"2021","unstructured":"Anurag Bagchi, Jazib Mahmood, Dolton Fernandes, and Ravi Kiran Sarvadevabhatla. 2021. Hear me out: Fusional approaches for audio augmented temporal action localization. arXiv preprint arXiv:2106.14118 (2021)."},{"key":"e_1_3_2_1_4_1","volume-title":"Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473","author":"Bahdanau Dzmitry","year":"2014","unstructured":"Dzmitry Bahdanau. 2014. Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)."},{"key":"e_1_3_2_1_5_1","volume-title":"FlashVTG: Feature Layering and Adaptive Score Handling Network for Video Temporal Grounding. arXiv preprint arXiv:2412.13441","author":"Cao Zhuo","year":"2024","unstructured":"Zhuo Cao, Bingqing Zhang, Heming Du, Xin Yu, Xue Li, and Sen Wang. 2024. FlashVTG: Feature Layering and Adaptive Score Handling Network for Video Temporal Grounding. arXiv preprint arXiv:2412.13441 (2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612504"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings, Part IV 16","author":"Chen Shaoxiang","year":"2020","unstructured":"Shaoxiang Chen, Wenhao Jiang, Wei Liu, and Yu-Gang Jiang. 2020. Learning modality interaction for temporal sentence localization and event captioning in videos. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part IV 16. Springer, 333--351."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6339--6348","author":"Cheng Xuelian","year":"2019","unstructured":"Xuelian Cheng, Yiran Zhong, Yuchao Dai, Pan Ji, and Hongdong Li. 2019. Noiseaware unsupervised deep lidar-stereo fusion. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6339--6348."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059295"},{"key":"e_1_3_2_1_12_1","first-page":"43107","article-title":"Temporal sentence grounding with relevance feedback in videos","volume":"37","author":"Dong Jianfeng","year":"2024","unstructured":"Jianfeng Dong, Xiaoman Peng, Daizong Liu, Xiaoye Qu, Xun Yang, Cuizhu Bao, and Meng Wang. 2024. Temporal sentence grounding with relevance feedback in videos. Advances in Neural Information Processing Systems 37 (2024), 43107--43132.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01038"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_15_1","volume-title":"Multimodal imbalance-aware gradient modulation for weakly-supervised audio-visual video parsing","author":"Fu Jie","year":"2023","unstructured":"Jie Fu, Junyu Gao, Bing-Kun Bao, and Changsheng Xu. 2023. Multimodal imbalance-aware gradient modulation for weakly-supervised audio-visual video parsing. IEEE Transactions on Circuits and Systems for Video Technology (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2020.04.001"},{"key":"e_1_3_2_1_18_1","volume-title":"Saliency-guided detr for moment retrieval and highlight detection. arXiv preprint arXiv:2410.01615","author":"Gordeev Aleksandr","year":"2024","unstructured":"Aleksandr Gordeev, Vladimir Dokholyan, Irina Tolstykh, and Maksim Kuprashevich. 2024. Saliency-guided detr for moment retrieval and highlight detection. arXiv preprint arXiv:2410.01615 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"VTG-LLM: Integrating Timestamp Knowledge into Video LLMs for Enhanced Video Temporal Grounding. arXiv preprint arXiv:2405.13382","author":"Guo Yongxin","year":"2024","unstructured":"Yongxin Guo, Jingyu Liu, Mingda Li, Xiaoying Tang, Xi Chen, and Bo Zhao. 2024. VTG-LLM: Integrating Timestamp Knowledge into Video LLMs for Enhanced Video Temporal Grounding. arXiv preprint arXiv:2405.13382 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"Jort F Gemmeke, Aren Jansen, R Channing Moore, Manoj Plakal, Devin Platt, Rif A Saurous, Bryan Seybold, et al.","author":"Hershey Shawn","year":"2017","unstructured":"Shawn Hershey, Sourish Chaudhuri, Daniel PW Ellis, Jort F Gemmeke, Aren Jansen, R Channing Moore, Manoj Plakal, Devin Platt, Rif A Saurous, Bryan Seybold, et al. 2017. CNN architectures for large-scale audio classification. In 2017 ieee international conference on acoustics, speech and signal processing (icassp). IEEE, 131--135."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_41"},{"key":"e_1_3_2_1_22_1","first-page":"29406","article-title":"Learning with noisy correspondence for cross-modal matching","volume":"34","author":"Huang Zhenyu","year":"2021","unstructured":"Zhenyu Huang, Guocheng Niu, Xiao Liu, Wenbiao Ding, Xinyan Xiao, Hua Wu, and Xi Peng. 2021. Learning with noisy correspondence for cross-modal matching. Advances in Neural Information Processing Systems 34 (2021), 29406--29419.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01273"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548309"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681115"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_28_1","volume-title":"European Conference on Computer Vision. Springer, 220--238","author":"Lee Pilhyeon","year":"2024","unstructured":"Pilhyeon Lee and Hyeran Byun. 2024. Bam-detr: Boundary-aligned moment detection transformer for temporal sentence grounding in videos. In European Conference on Computer Vision. Springer, 220--238."},{"key":"e_1_3_2_1_29_1","first-page":"11846","article-title":"Detecting moments and highlights in videos via natural language queries","volume":"34","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Tamara L Berg, and Mohit Bansal. 2021. Detecting moments and highlights in videos via natural language queries. Advances in Neural Information Processing Systems 34 (2021), 11846--11858.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","volume-title":"Prototype-based aleatoric uncertainty quantification for cross-modal retrieval. Advances in Neural Information Processing Systems 36","author":"Li Hao","year":"2024","unstructured":"Hao Li, Jingkuan Song, Lianli Gao, Xiaosu Zhu, and Hengtao Shen. 2024. Prototype-based aleatoric uncertainty quantification for cross-modal retrieval. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"e_1_3_2_1_32_1","volume-title":"Multigranularity correspondence learning from long-term noisy videos. arXiv preprint arXiv:2401.16702","author":"Lin Yijie","year":"2024","unstructured":"Yijie Lin, Jie Zhang, Zhenyu Huang, Jia Liu, Zujie Wen, and Xi Peng. 2024. Multigranularity correspondence learning from long-term noisy videos. arXiv preprint arXiv:2401.16702 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Adaptive proposal generation network for temporal sentence localization in videos. arXiv preprint arXiv:2109.06398","author":"Liu Daizong","year":"2021","unstructured":"Daizong Liu, Xiaoye Qu, Jianfeng Dong, and Pan Zhou. 2021. Adaptive proposal generation network for temporal sentence localization in videos. arXiv preprint arXiv:2109.06398 (2021)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210003"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"e_1_3_2_1_36_1","volume-title":"Object-centric learning with slot attention. Advances in neural information processing systems 33","author":"Locatello Francesco","year":"2020","unstructured":"Francesco Locatello, Dirk Weissenborn, Thomas Unterthiner, Aravindh Mahendran, Georg Heigold, Jakob Uszkoreit, Alexey Dosovitskiy, and Thomas Kipf. 2020. Object-centric learning with slot attention. Advances in neural information processing systems 33 (2020), 11525--11538."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20044-1_28"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01030"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00138"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_42_1","volume-title":"International conference on machine learning. PmLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748--8763."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102357"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613772"},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"38","author":"Sun Hao","year":"2024","unstructured":"Hao Sun, Mingyao Zhou, Wenjing Chen, and Wei Xie. 2024. Tr-detr: Taskreciprocal transformer for joint moment retrieval and highlight detection. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38. 4998--5007."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532083"},{"key":"e_1_3_2_1_48_1","volume-title":"COLD fusion: Calibrated and ordinal latent distribution fusion for uncertainty-aware multimodal emotion recognition","author":"Tellamekala Mani Kumar","year":"2023","unstructured":"Mani Kumar Tellamekala, Shahin Amiriparian, Bj\u00f6rnWSchuller, Elisabeth Andr\u00e9, Timo Giesbrecht, and Michel Valstar. 2023. COLD fusion: Calibrated and ordinal latent distribution fusion for uncertainty-aware multimodal emotion recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_50_1","volume-title":"Grounded-videollm: Sharpening finegrained temporal grounding in video large language models. arXiv preprint arXiv:2410.03290","author":"Wang Haibo","year":"2024","unstructured":"Haibo Wang, Zhiyang Xu, Yu Cheng, Shizhe Diao, Yufan Zhou, Yixin Cao, Qifan Wang, Weifeng Ge, and Lifu Huang. 2024. Grounded-videollm: Sharpening finegrained temporal grounding in video large language models. arXiv preprint arXiv:2410.03290 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"MS-DETR: Natural Language Video Localization with Sampling Moment-Moment Interaction. arXiv preprint arXiv:2305.18969","author":"Wang Jing","year":"2023","unstructured":"Jing Wang, Aixin Sun, Hao Zhang, and Xiaoli Li. 2023. MS-DETR: Natural Language Video Localization with Sampling Moment-Moment Interaction. arXiv preprint arXiv:2305.18969 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"European Conference on Computer Vision. Springer, 396--416","author":"Wang Yi","year":"2024","unstructured":"Yi Wang, Kunchang Li, Xinhao Li, Jiashuo Yu, Yinan He, Guo Chen, Baoqi Pei, Rongkun Zheng, Zun Wang, Yansong Shi, et al. 2024. Internvideo2: Scaling foundation models for multimodal video understanding. In European Conference on Computer Vision. Springer, 396--416."},{"key":"e_1_3_2_1_53_1","volume-title":"Hawkeye: Training video-text llms for grounding text in videos. arXiv preprint arXiv:2403.10228","author":"Wang Yueqian","year":"2024","unstructured":"Yueqian Wang, Xiaojun Meng, Jianxin Liang, Yuxuan Wang, Qun Liu, and Dongyan Zhao. 2024. Hawkeye: Training video-text llms for grounding text in videos. arXiv preprint arXiv:2403.10228 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"36","author":"Li Tianhao","year":"2022","unstructured":"ZhenzhiWang, LiminWang, TaoWu, Tianhao Li, and GangshanWu. 2022. Negative sample matters: A renaissance of metric learning for temporal grounding. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 36. 2613--2623."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics. 1855--1864","author":"Xu Bo","year":"2022","unstructured":"Bo Xu, Shizhou Huang, Ming Du, Hongya Wang, Hui Song, Chaofeng Sha, and Yanghua Xiao. 2022. Different data, different modalities! reinforced data splitting for effective multimodal information extraction from social media posts. In Proceedings of the 29th International Conference on Computational Linguistics. 1855--1864."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096655"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01253"},{"key":"e_1_3_2_1_59_1","volume-title":"Entity-aware and motion-aware transformers for language-driven action localization in videos. arXiv preprint arXiv:2205.05854","author":"Yang Shuo","year":"2022","unstructured":"Shuo Yang and Xinxiao Wu. 2022. Entity-aware and motion-aware transformers for language-driven action localization in videos. arXiv preprint arXiv:2205.05854 (2022)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3368919"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02106-7"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-023-4084-6"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i15.29578"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"e_1_3_2_1_65_1","volume-title":"Joey Tianyi Zhou, and Rick Siow Mong Goh","author":"Zhang Hao","year":"2021","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, Liangli Zhen, Joey Tianyi Zhou, and Rick Siow Mong Goh. 2021. Parallel attention network with sequence matching for video grounding. arXiv preprint arXiv:2105.08481 (2021)."},{"key":"e_1_3_2_1_66_1","volume-title":"Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931","author":"Zhang Hao","year":"2020","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, and Joey Tianyi Zhou. 2020. Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931 (2020)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3530570"},{"key":"e_1_3_2_1_68_1","unstructured":"Qingyang Zhang Yake Wei Zongbo Han Huazhu Fu Xi Peng Cheng Deng Qinghua Hu Cai Xu JieWen Di Hu et al. 2024. Multimodal fusion on low-quality data: A comprehensive survey. arXiv preprint arXiv:2404.18947 (2024)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3120745"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_1_71_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14794--14804","author":"Zhang Yimeng","year":"2023","unstructured":"Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, and Ke Ding. 2023. Textvisual prompting for efficient 2d temporal video grounding. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14794--14804."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544493"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00319"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01453"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754982","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:04:12Z","timestamp":1765339452000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754982"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":74,"alternative-id":["10.1145\/3746027.3754982","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754982","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}