{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:29:19Z","timestamp":1770917359462,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Institute of Information & communications Technology Planning & Evaluation (IITP) grant funded by the Korea government(MSIT)","award":["2021-0-01343"],"award-info":[{"award-number":["2021-0-01343"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681304","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"5318-5327","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Learnable Negative Proposals Using Dual-Signed Cross-Entropy Loss for Weakly Supervised Video Moment Localization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4336-4851","authenticated-orcid":false,"given":"Sunoh","family":"Kim","sequence":"first","affiliation":[{"name":"Mobile eXperience, Samsung Electronics, Suwon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0223-2400","authenticated-orcid":false,"given":"Daeho","family":"Um","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9377-6008","authenticated-orcid":false,"given":"HyunJun","family":"Choi","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3891-5815","authenticated-orcid":false,"given":"Jin Young","family":"Choi","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Vqa: Visual question answering. In ICCV. 2425--2433.","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C Lawrence Zitnick, and Devi Parikh. 2015. Vqa: Visual question answering. In ICCV. 2425--2433."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Boris Babenko Ming-Hsuan Yang and Serge Belongie. 2009. Visual tracking with online multiple instance learning. In CVPR. 983--990.","DOI":"10.1109\/CVPR.2009.5206737"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27832"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27831"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Yoshua Bengio J\u00e9r\u00f4me Louradour Ronan Collobert and Jason Weston. 2009. Curriculum learning. In ICML. 41--48.","DOI":"10.1145\/1553374.1553380"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Meng Cao Fangyun Wei Can Xu Xiubo Geng Long Chen Can Zhang Yuexian Zou Tao Shen and Daxin Jiang. 2023. Iterative Proposal Refinement for Weakly-Supervised Video Grounding. In CVPR. 6524--6534.","DOI":"10.1109\/CVPR52729.2023.00631"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"crossref","unstructured":"Joao Carreira and Andrew Zisserman. 2017. Quo vadis action recognition? a new model and the kinetics dataset. In CVPR. 6299--6308.","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Jiaming Chen Weixin Luo Wei Zhang and Lin Ma. 2022. Explore Inter-contrast between Videos via Composition for Weakly Supervised Temporal Sentence Grounding. In AAAI.","DOI":"10.1609\/aaai.v36i1.19902"},{"key":"e_1_3_2_2_9_1","volume-title":"Look closer to ground better: Weakly-supervised temporal grounding of sentence in video. arXiv preprint arXiv:2001.09308","author":"Chen Zhenfang","year":"2020","unstructured":"Zhenfang Chen, Lin Ma, Wenhan Luo, Peng Tang, and Kwan-Yee K Wong. 2020. Look closer to ground better: Weakly-supervised temporal grounding of sentence in video. arXiv preprint arXiv:2001.09308 (2020)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Valentin Gabeur Chen Sun Karteek Alahari and Cordelia Schmid. 2020. Multi-modal transformer for video retrieval. In ECCV. 214--229.","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_2_12_1","volume-title":"Tall: Temporal activity localization via language query. In ICCV. 5267--5275.","author":"Gao Jiyang","year":"2017","unstructured":"Jiyang Gao, Chen Sun, Zhenheng Yang, and Ram Nevatia. 2017. Tall: Temporal activity localization via language query. In ICCV. 5267--5275."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"Jiabo Huang Yang Liu Shaogang Gong and Hailin Jin. 2021. Cross-sentence temporal and semantic relations in video activity localisation. In ICCV. 7199--7208.","DOI":"10.1109\/ICCV48922.2021.00711"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Yifei Huang Lijin Yang and Yoichi Sato. 2023. Weakly Supervised Temporal Sentence Grounding With Uncertainty-Guided Self-Training. In CVPR. 18908--18918.","DOI":"10.1109\/CVPR52729.2023.01813"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"crossref","unstructured":"Lu Jiang Deyu Meng Qian Zhao Shiguang Shan and Alexander Hauptmann. 2015. Self-paced curriculum learning. In AAAI.","DOI":"10.1609\/aaai.v29i1.9608"},{"key":"e_1_3_2_2_16_1","volume-title":"Mentornet: Learning data-driven curriculum for very deep neural networks on corrupted labels. In ICML. 2304--2313.","author":"Jiang Lu","year":"2018","unstructured":"Lu Jiang, Zhengyuan Zhou, Thomas Leung, Li-Jia Li, and Li Fei-Fei. 2018. Mentornet: Learning data-driven curriculum for very deep neural networks on corrupted labels. In ICML. 2304--2313."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28059"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","unstructured":"Sunoh Kim Taegil Ha Kimin Yun and Jin Young Choi. 2022. SWAG-Net: Semantic Word-Aware Graph Network for Temporal Video Grounding. In ACM CIKM. 982--992. https:\/\/doi.org\/10.1145\/3511808.3557463","DOI":"10.1145\/3511808.3557463"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"crossref","unstructured":"Sunoh Kim Kimin Yun and Jin Young Choi. 2021. Position-aware Location Regression Network for Temporal Video Grounding. In AVSS. 1--8.","DOI":"10.1109\/AVSS52988.2021.9663815"},{"key":"e_1_3_2_2_20_1","volume-title":"Adam: A method for stochastic optimization. In ICLR.","author":"Kingma Diederik P","year":"2015","unstructured":"Diederik P Kingma and Jimmy Ba. 2015. Adam: A method for stochastic optimization. In ICLR."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Shuhan Kong Liang Li Beichen Zhang Wenyu Wang Bin Jiang Chenggang Yan and Changhao Xu. 2023. Dynamic Contrastive Learning with Pseudo-samples Intervention for Weakly Supervised Joint Video MR and HD. In ACM MM. 538--546.","DOI":"10.1145\/3581783.3612384"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Kenji Hata Frederic Ren Li Fei-Fei and Juan Carlos Niebles. 2017. Dense-captioning events in videos. In ICCV. 706--715.","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_2_23_1","unstructured":"Xiaohan Lan Yitian Yuan Hong Chen Xin Wang Zequn Jie Lin Ma Zhi Wang and Wenwu Zhu. 2023. Curriculum multi-negative augmentation for debiased video grounding. In AAAI."},{"key":"e_1_3_2_2_24_1","volume-title":"Tvr: A large-scale dataset for video-subtitle moment retrieval","author":"Lei Jie","year":"2020","unstructured":"Jie Lei, Licheng Yu, Tamara L Berg, and Mohit Bansal. 2020. Tvr: A large-scale dataset for video-subtitle moment retrieval. In ECCV. Springer."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Thomas Leung Yang Song and John Zhang. 2011. Handling label noise in video classification via multiple instance learning. In ICCV. 2056--2063.","DOI":"10.1109\/ICCV.2011.6126479"},{"key":"e_1_3_2_2_26_1","unstructured":"Siyang Li Xiangxin Zhu Qin Huang Hao Xu and C-C Jay Kuo. 2017. Multiple instance curriculum learning for weakly supervised object detection. In BMVC."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Zezhong Lv Bing Su and Ji-Rong Wen. 2023. Counterfactual cross-modality reasoning for weakly supervised video moment localization. In ACM MM. 6539--6547.","DOI":"10.1145\/3581783.3612495"},{"key":"e_1_3_2_2_29_1","unstructured":"Yu-Fei Ma Lie Lu Hong-Jiang Zhang and Mingjing Li. 2002. A user attention model for video summarization. In ACM MM. 533--542."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Antoine Miech Jean-Baptiste Alayrac Lucas Smaira Ivan Laptev Josef Sivic and Andrew Zisserman. 2020. End-to-end learning of visual representations from uncurated instructional videos. In CVPR. 9879--9889.","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"e_1_3_2_2_31_1","unstructured":"Niluthpol Chowdhury Mithun Sujoy Paul and Amit K Roy-Chowdhury. 2019. Weakly supervised video moment retrieval from text queries. In CVPR. 11592--11601."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Jeffrey Pennington Richard Socher and Christopher Manning. 2014. GloVe: Global Vectors for Word Representation. In EMNLP. 1532--1543.","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_2_33_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_2_34_1","volume-title":"Weakly-supervised multi-level attentional reconstruction network for grounding textual queries in videos. arXiv preprint arXiv:2003.07048","author":"Song Yijun","year":"2020","unstructured":"Yijun Song, Jingwen Wang, Lin Ma, Zhou Yu, and Jun Yu. 2020. Weakly-supervised multi-level attentional reconstruction network for grounding textual queries in videos. arXiv preprint arXiv:2003.07048 (2020)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Waqas Sultani Chen Chen and Mubarak Shah. 2018. Real-world anomaly detection in surveillance videos. In CVPR. 6479--6488.","DOI":"10.1109\/CVPR.2018.00678"},{"key":"e_1_3_2_2_36_1","volume-title":"Logan: Latent graph co-attention network for weakly-supervised video moment retrieval. In WACV. 2083--2092.","author":"Tan Reuben","year":"2021","unstructured":"Reuben Tan, Huijuan Xu, Kate Saenko, and Bryan A Plummer. 2021. Logan: Latent graph co-attention network for weakly-supervised video moment retrieval. In WACV. 2083--2092."},{"key":"e_1_3_2_2_37_1","volume-title":"Movieqa: Understanding stories in movies through question-answering. In CVPR. 4631--4640.","author":"Tapaswi Makarand","year":"2016","unstructured":"Makarand Tapaswi, Yukun Zhu, Rainer Stiefelhagen, Antonio Torralba, Raquel Urtasun, and Sanja Fidler. 2016. Movieqa: Understanding stories in movies through question-answering. In CVPR. 4631--4640."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"crossref","unstructured":"Du Tran Lubomir Bourdev Rob Fergus Lorenzo Torresani and Manohar Paluri. 2015. Learning spatiotemporal features with 3d convolutional networks. In ICCV. 4489--4497.","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_2_39_1","volume-title":"NeurIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS, Vol. 30."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00331"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Jiang Wang Yang Song Thomas Leung Chuck Rosenberg Jingbin Wang James Philbin Bo Chen and Ying Wu. 2014. Learning fine-grained image similarity with deep ranking. In CVPR. 1386--1393.","DOI":"10.1109\/CVPR.2014.180"},{"key":"e_1_3_2_2_42_1","volume-title":"Weakly supervised temporal adjacent network for language grounding","author":"Wang Yuechen","year":"2021","unstructured":"Yuechen Wang, Jiajun Deng, Wengang Zhou, and Houqiang Li. 2021. Weakly supervised temporal adjacent network for language grounding. IEEE Transactions on Multimedia (2021)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Zheng Wang Jingjing Chen and Yu-Gang Jiang. 2021. Visual co-occurrence alignment learning for weakly-supervised video moment retrieval. In ACM MM. 1459--1468.","DOI":"10.1145\/3474085.3475278"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"crossref","unstructured":"Jie Wu Guanbin Li Xiaoguang Han and Liang Lin. 2020. Reinforcement learning for weakly supervised temporal grounding of natural language in untrimmed videos. In ACM MM. 1283--1291.","DOI":"10.1145\/3394171.3413862"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Yan Xia Zhou Zhao Shangwei Ye Yang Zhao Haoyuan Li and Yi Ren. 2022. Video-guided curriculum learning for spoken video grounding. In ACM MM. 5191--5200.","DOI":"10.1145\/3503161.3547996"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"crossref","unstructured":"Shaoning Xiao Long Chen Songyang Zhang Wei Ji Jian Shao Lu Ye and Jun Xiao. 2021. Boundary Proposal Network for Two-Stage Natural Language Video Localization. In AAAI. 2986--2994. https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/16406","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","unstructured":"Huijuan Xu Kun He Bryan A Plummer Leonid Sigal Stan Sclaroff and Kate Saenko. 2019. Multilevel language and vision integration for text-to-clip retrieval. In AAAI. 9062--9069. https:\/\/doi.org\/10.1609\/aaai.v33i01.33019062","DOI":"10.1609\/aaai.v33i01.33019062"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3058614"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"crossref","unstructured":"Sunjae Yoon Gwanhyeong Koo Dahyun Kim and Chang D Yoo. 2023. SCANet: Scene Complexity Aware Network for Weakly-Supervised Video Moment Retrieval. In ICCV. 13576--13586.","DOI":"10.1109\/ICCV51070.2023.01249"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","unstructured":"Zhu Zhang Zhijie Lin Zhou Zhao and Zhenxin Xiao. 2019. Cross-modal interaction networks for query-based moment retrieval in videos. In ACM SIGIR. 655--664. https:\/\/doi.org\/10.1145\/3331184.3331235","DOI":"10.1145\/3331184.3331235"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","unstructured":"Zhu Zhang Zhijie Lin Zhou Zhao Jieming Zhu and Xiuqiang He. 2020. Regularized two-branch proposal networks for weakly-supervised moment retrieval in videos. In ACM MM. 4098--4106.","DOI":"10.1145\/3394171.3413967"},{"key":"e_1_3_2_2_52_1","first-page":"18123","article-title":"Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume":"33","author":"Zhang Zhu","year":"2020","unstructured":"Zhu Zhang, Zhou Zhao, Zhijie Lin, Xiuqiang He, et al. 2020. Counterfactual contrastive learning for weakly-supervised vision-language grounding. In NeurIPS, Vol. 33. 18123--18134.","journal-title":"NeurIPS"},{"key":"e_1_3_2_2_53_1","first-page":"3","article-title":"Weakly supervised video moment localization with contrastive negative sample mining","volume":"1","author":"Zheng Minghang","year":"2022","unstructured":"Minghang Zheng, Yanjie Huang, Qingchao Chen, and Yang Liu. 2022. Weakly supervised video moment localization with contrastive negative sample mining. In AAAI, Vol. 1. 3.","journal-title":"AAAI"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"crossref","unstructured":"Minghang Zheng Yanjie Huang Qingchao Chen Yuxin Peng and Yang Liu. 2022. Weakly Supervised Temporal Sentence Grounding With Gaussian-Based Contrastive Proposal Learning. In CVPR. 15555--15564.","DOI":"10.1109\/CVPR52688.2022.01511"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681304","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681304","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:43Z","timestamp":1750295863000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681304"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":54,"alternative-id":["10.1145\/3664647.3681304","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681304","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}