{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:42:46Z","timestamp":1777653766772,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62222203, 61976049"],"award-info":[{"award-number":["62222203, 61976049"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CAAI-Huawei MindSpore Open Fund"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612349","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"975-983","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":28,"title":["Joint Searching and Grounding: Multi-Granularity Video Content Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8962-7951","authenticated-orcid":false,"given":"Zhiguo","family":"Chen","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2209-651X","authenticated-orcid":false,"given":"Xun","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5685-3123","authenticated-orcid":false,"given":"Xing","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9446-5105","authenticated-orcid":false,"given":"Zuo","family":"Cao","sequence":"additional","affiliation":[{"name":"Meituan, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0991-3597","authenticated-orcid":false,"given":"Yijun","family":"Mo","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2999-2088","authenticated-orcid":false,"given":"Heng Tao","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China &amp; Peng Cheng Laboratory, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01365"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547976"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00957"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059295"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3150959"},{"key":"e_1_3_2_1_9_1","unstructured":"Victor Escorcia Mattia Soldan Josef Sivic Bernard Ghanem and Bryan Russell. 2019. Temporal localization of moments in video collections with natural language. (2019)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3075470"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9726--9735","author":"He Kaiming","unstructured":"Kaiming He, Haoqi Fan, Yuxin Wu, Saining Xie, and Ross B. Girshick. 2020a. Momentum Contrast for Unsupervised Visual Representation Learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9726--9735."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9726--9735","author":"He Kaiming","unstructured":"Kaiming He, Haoqi Fan, Yuxin Wu, Saining Xie, and Ross B. Girshick. 2020b. Momentum Contrast for Unsupervised Visual Representation Learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9726--9735."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_17_1","volume-title":"SDN: Semantic Decoupling Network for Temporal Language Grounding","author":"Jiang Xun","year":"2022","unstructured":"Xun Jiang, Xing Xu, Jingran Zhang, Fumin Shen, Zuo Cao, and Heng Tao Shen. 2022a. SDN: Semantic Decoupling Network for Temporal Language Grounding. IEEE Transactions on Neural Networks and Learning Systems (2022)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00250"},{"key":"e_1_3_2_1_19_1","volume-title":"Faster Video Moment Retrieval with Point-Level Supervision. arXiv preprint arXiv:2305.14017","author":"Jiang Xun","year":"2023","unstructured":"Xun Jiang, Zailei Zhou, Xing Xu, Yang Yang, Guoqing Wang, and Heng Tao Shen. 2023. Faster Video Moment Retrieval with Point-Level Supervision. arXiv preprint arXiv:2305.14017 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Adam: A Method for Stochastic Optimization. In International Conference on Learning Representations.","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350906"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"e_1_3_2_1_26_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Artificial Intelligence and Statistics. 4348--4380","author":"Nakada Ryumei","year":"2023","unstructured":"Ryumei Nakada, Halil Ibrahim Gulluk, Zhun Deng, Wenlong Ji, James Zou, and Linjun Zhang. 2023. Understanding multimodal contrastive learning and incorporating unpaired data. In International Conference on Artificial Intelligence and Statistics. 4348--4380."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01430"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3120038"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_32_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3142420"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475665"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462874"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_1_38_1","unstructured":"Zhu Zhang Zhou Zhao Zhijie Lin Jieming Zhu and Xiuqiang He. 2020b. Counterfactual Contrastive Learning for Weakly-Supervised Vision-Language Grounding. In Advances in Neural Information Processing Systems. 18123--18134."},{"key":"e_1_3_2_1_39_1","volume-title":"Weakly Supervised Video Moment Localization with Contrastive Negative Sample Mining. In AAAI Conference on Artificial Intelligence. 3517--3525","author":"Zheng Minghang","year":"2022","unstructured":"Minghang Zheng, Yanjie Huang, Qingchao Chen, and Yang Liu. 2022a. Weakly Supervised Video Moment Localization with Contrastive Negative Sample Mining. In AAAI Conference on Artificial Intelligence. 3517--3525."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01511"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Ding Zou Wei Wei Xian-Ling Mao Ziyang Wang Minghui Qiu Feida Zhu and Xin Cao. 2022Multi-level Cross-view Contrastive Learning for Knowledge-aware Recommender System. In Proceedings of the ACM International Conference on Research & Development in Information Retrieval. 1358--1368.","DOI":"10.1145\/3477495.3532025"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612349","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612349","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:59:43Z","timestamp":1755820783000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612349"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":41,"alternative-id":["10.1145\/3581783.3612349","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612349","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}