{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T16:45:27Z","timestamp":1780418727390,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276155,62206156,62206157"],"award-info":[{"award-number":["62276155,62206156,62206157"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Laboratory of Computing Power Network and Information Security, Ministry of Education","award":["2023ZD031"],"award-info":[{"award-number":["2023ZD031"]}]},{"name":"NSF of Shandong Province","award":["ZR2021MF040,ZR2022QF047"],"award-info":[{"award-number":["ZR2021MF040,ZR2022QF047"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680774","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"9214-9223","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Explicit Granularity and Implicit Scale Correspondence Learning for Point-Supervised Video Moment Localization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4856-8806","authenticated-orcid":false,"given":"Kun","family":"Wang","sequence":"first","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1026-4499","authenticated-orcid":false,"given":"Hao","family":"Liu","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6895-3558","authenticated-orcid":false,"given":"Lirong","family":"Jie","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5136-159X","authenticated-orcid":false,"given":"Zixu","family":"Li","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5653-8286","authenticated-orcid":false,"given":"Yupeng","family":"Hu","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16175"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_34"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.773"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00631"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612504"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19902"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.639"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532078"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01137"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570405"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"e_1_3_2_1_15_1","volume-title":"Wslln: Weakly supervised natural language localization networks. arXiv preprint arXiv:1909.00239","author":"Gao Mingfei","year":"2019","unstructured":"Mingfei Gao, Larry S Davis, Richard Socher, and Caiming Xiong. 2019. Wslln: Weakly supervised natural language localization networks. arXiv preprint arXiv:1909.00239 (2019)."},{"key":"e_1_3_2_1_16_1","volume-title":"Align, match and distill for video-text retrieval. arXiv preprint arXiv:2111.05610","author":"Gao Zijian","year":"2021","unstructured":"Zijian Gao, Jingyu Liu, Weiqi Sun, Sheng Chen, Dedan Chang, and Lili Zhao. 2021. Clip2tv: Align, match and distill for video-text retrieval. arXiv preprint arXiv:2111.05610 (2021)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27959"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00292"},{"key":"e_1_3_2_1_19_1","volume-title":"CONE: An Efficient COarse-to-fiNE Alignment Framework for Long Video Temporal Grounding. In The 61st Annual Meeting Of The Association For Computational Linguistics.","author":"Hou Zhijian","year":"2023","unstructured":"Zhijian Hou, Wanjun Zhong, Lei Ji, DIFEI GAO, Kun Yan, WK Chan, Chong-Wah Ngo, Mike Zheng Shou, and Nan Duan. 2023. CONE: An Efficient COarse-to-fiNE Alignment Framework for Long Video Temporal Grounding. In The 61st Annual Meeting Of The Association For Computational Linguistics."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3073867"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3090521"},{"key":"e_1_3_2_1_22_1","first-page":"1","article-title":"Semantic collaborative learning for cross-modal moment localization","volume":"42","author":"Hu Yupeng","year":"2023","unstructured":"Yupeng Hu, Kun Wang, Meng Liu, Haoyu Tang, and Liqiang Nie. 2023. Semantic collaborative learning for cross-modal moment localization. ACM Transactions on Information Systems, Vol. 42, 2 (2023), 1--26.","journal-title":"ACM Transactions on Information Systems"},{"key":"e_1_3_2_1_23_1","first-page":"29406","article-title":"Learning with noisy correspondence for cross-modal matching","volume":"34","author":"Huang Zhenyu","year":"2021","unstructured":"Zhenyu Huang, Guocheng Niu, Xiao Liu, Wenbiao Ding, Xinyan Xiao, Hua Wu, and Xi Peng. 2021. Learning with noisy correspondence for cross-modal matching. Advances in Neural Information Processing Systems, Vol. 34 (2021), 29406--29419.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","volume-title":"Faster Video Moment Retrieval with Point-Level Supervision. arXiv preprint arXiv:2305.14017","author":"Jiang Xun","year":"2023","unstructured":"Xun Jiang, Zailei Zhou, Xing Xu, Yang Yang, Guoqing Wang, and Heng Tao Shen. 2023. Faster Video Moment Retrieval with Point-Level Supervision. arXiv preprint arXiv:2305.14017 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.77"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28059"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01339"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01263"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3217449"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611886"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3449986"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210003"},{"key":"e_1_3_2_1_34_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_35_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Luo Zhuoyan","year":"2024","unstructured":"Zhuoyan Luo, Yicheng Xiao, Yong Liu, Shuyan Li, Yitong Wang, Yansong Tang, Xiu Li, and Yujiu Yang. 2024. Soc: Semantic-assisted object cluster for referring video object segmentation. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Sf-net: Single-frame supervision for temporal action localization. In Computer Vision-ECCV 2020: 16th European Conference","author":"Ma Fan","year":"2020","unstructured":"Fan Ma, Linchao Zhu, Yi Yang, Shengxin Zha, Gourab Kundu, Matt Feiszli, and Zheng Shou. 2020. Sf-net: Single-frame supervision for temporal action localization. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part IV 16. Springer, 420--437."},{"key":"e_1_3_2_1_37_1","volume-title":"Vlanet: Video-language alignment network for weakly-supervised video moment retrieval. In Computer Vision-ECCV 2020: 16th European Conference","author":"Ma Minuk","year":"2020","unstructured":"Minuk Ma, Sunjae Yoon, Junyeong Kim, Youngjoon Lee, Sunghun Kang, and Chang D Yoo. 2020. Vlanet: Video-language alignment network for weakly-supervised video moment retrieval. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XXVIII 16. Springer, 156--171."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00207"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33718-5_11"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531396"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102460"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3063631"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2021.3110713"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.193"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_49_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00679"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3168424"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3205404"},{"key":"e_1_3_2_1_53_1","volume-title":"Entity-aware and motion-aware transformers for language-driven action localization in videos. arXiv preprint arXiv:2205.05854","author":"Yang Shuo","year":"2022","unstructured":"Shuo Yang and Xinxiao Wu. 2022. Entity-aware and motion-aware transformers for language-driven action localization in videos. arXiv preprint arXiv:2205.05854 (2022)."},{"key":"e_1_3_2_1_54_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Yu Shoubin","year":"2024","unstructured":"Shoubin Yu, Jaemin Cho, Prateek Yadav, and Mohit Bansal. 2024. Self-chained image-language model for video localization and question answering. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"e_1_3_2_1_56_1","volume-title":"Temporal sentence grounding in videos: A survey and future directions","author":"Zhang Hao","year":"2023","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, and Joey Tianyi Zhou. 2023. Temporal sentence grounding in videos: A survey and future directions. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350879"},{"key":"e_1_3_2_1_59_1","first-page":"18123","article-title":"Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume":"33","author":"Zhang Zhu","year":"2020","unstructured":"Zhu Zhang, Zhou Zhao, Zhijie Lin, Xiuqiang He, et al. 2020. Counterfactual contrastive learning for weakly-supervised vision-language grounding. Advances in Neural Information Processing Systems, Vol. 33 (2020), 18123--18134.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20263"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01511"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680774","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680774","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:42Z","timestamp":1750294662000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680774"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":61,"alternative-id":["10.1145\/3664647.3680774","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680774","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}