{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:46:04Z","timestamp":1778082364043,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":87,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,7,11]],"date-time":"2021-07-11T00:00:00Z","timestamp":1625961600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001348","name":"Agency for Science, Technology and Research","doi-asserted-by":"publisher","award":["A18A1b0045 and A18A2b0046"],"award-info":[{"award-number":["A18A1b0045 and A18A2b0046"]}],"id":[{"id":"10.13039\/501100001348","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,7,11]]},"DOI":"10.1145\/3404835.3462874","type":"proceedings-article","created":{"date-parts":[[2021,7,12]],"date-time":"2021-07-12T02:41:48Z","timestamp":1626057708000},"page":"685-695","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":82,"title":["Video Corpus Moment Retrieval with Contrastive Learning"],"prefix":"10.1145","author":[{"given":"Hao","family":"Zhang","sequence":"first","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aixin","family":"Sun","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Jing","sequence":"additional","affiliation":[{"name":"Agency for Science, Technology and Research, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guoshun","family":"Nan","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liangli","family":"Zhen","sequence":"additional","affiliation":[{"name":"Agency for Science, Technology and Research, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Joey Tianyi","family":"Zhou","sequence":"additional","affiliation":[{"name":"Agency for Science, Technology and Research, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rick Siow Mong","family":"Goh","sequence":"additional","affiliation":[{"name":"Agency for Science, Technology and Research, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,7,11]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Hinton","author":"Ba Jimmy","year":"2016","unstructured":"Jimmy Ba, J. Kiros, and Geoffrey E. Hinton. 2016. Layer Normalization. ArXiv , Vol. abs\/1607.06450 (2016)."},{"key":"e_1_3_2_2_2_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Bachman Philip","year":"2019","unstructured":"Philip Bachman, R Devon Hjelm, and William Buchwalter. 2019. Learning Representations by Maximizing Mutual Information Across Views. In Advances in Neural Information Processing Systems, Vol. 32. Curran Associates, Inc., 15535--15545."},{"key":"e_1_3_2_2_3_1","volume-title":"International Conference on Learning Representations .","author":"Bahdanau Dzmitry","year":"2015","unstructured":"Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2015. Neural Machine Translation by Jointly Learning to Align and Translate. In International Conference on Learning Representations ."},{"key":"e_1_3_2_2_4_1","volume-title":"Mine: mutual information neural estimation. arXiv preprint arXiv:1801.04062","author":"Belghazi Mohamed Ishmael","year":"2018","unstructured":"Mohamed Ishmael Belghazi, Aristide Baratin, Sai Rajeswar, Sherjil Ozair, Yoshua Bengio, Aaron Courville, and R Devon Hjelm. 2018. Mine: mutual information neural estimation. arXiv preprint arXiv:1801.04062 (2018)."},{"key":"e_1_3_2_2_5_1","volume-title":"An information-maximization approach to blind separation and blind deconvolution. Neural computation","author":"Bell Anthony J","year":"1995","unstructured":"Anthony J Bell and Terrence J Sejnowski. 1995. An information-maximization approach to blind separation and blind deconvolution. Neural computation , Vol. 7, 6 (1995), 1129--1159."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413840"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1015"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6627"},{"key":"e_1_3_2_2_10_1","volume-title":"Learning Modality Interaction for Temporal Sentence Localization and Event Captioning in Videos. In The European Conference on Computer Vision .","author":"Chen Shaoxiang","year":"2020","unstructured":"Shaoxiang Chen, Wenhao Jiang, Wei Liu, and Yu-Gang Jiang. 2020 a. Learning Modality Interaction for Temporal Sentence Localization and Event Captioning in Videos. In The European Conference on Computer Vision ."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018199"},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of the 37th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020 b. A Simple Framework for Contrastive Learning of Visual Representations. In Proceedings of the 37th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 119). PMLR, 1597--1607."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_14_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Association for Computational Linguistics, 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Association for Computational Linguistics, 4171--4186."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00957"},{"key":"e_1_3_2_2_16_1","volume-title":"Temporal Localization of Moments in Video Collections with Natural Language. arXiv preprint arXiv:1907.12763","author":"Escorcia Victor","year":"2019","unstructured":"Victor Escorcia, Mattia Soldan, Josef Sivic, Bernard Ghanem, and Bryan Russell. 2019. Temporal Localization of Moments in Video Collections with Natural Language. arXiv preprint arXiv:1907.12763 (2019)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_2_19_1","volume-title":"MAC: Mining Activity Concepts for Language-based Temporal Localization. In IEEE Winter Conference on Applications of Computer Vision. 245--253","author":"Ge Runzhou","year":"2019","unstructured":"Runzhou Ge, Jiyang Gao, Kan Chen, and Ram Nevatia. 2019. MAC: Mining Activity Concepts for Language-based Temporal Localization. In IEEE Winter Conference on Applications of Computer Vision. 245--253."},{"key":"e_1_3_2_2_20_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 1984--1990","author":"Ghosh Soham","year":"2019","unstructured":"Soham Ghosh, Anuva Agarwal, Zarana Parekh, and Alexander Hauptmann. 2019. ExCL: Extractive Clip Localization Using Natural Language Descriptions. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 1984--1990."},{"key":"e_1_3_2_2_21_1","volume-title":"Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (Proceedings of Machine Learning Research","volume":"304","author":"Gutmann Michael","year":"2010","unstructured":"Michael Gutmann and Aapo Hyv\u00e4rinen. 2010. Noise-contrastive estimation: A new estimation principle for unnormalized statistical models. In Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (Proceedings of Machine Learning Research, Vol. 9). JMLR Workshop and Conference Proceedings, Chia Laguna Resort, Sardinia, Italy, 297--304."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"key":"e_1_3_2_2_23_1","volume-title":"The British Machine Vision Conference .","author":"Hahn Meera","year":"2020","unstructured":"Meera Hahn, Asim Kadav, James M Rehg, and Hans Peter Graf. 2020. Tripping through time: Efficient localization of activities in videos. In The British Machine Vision Conference ."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018393"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_27_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 961--970","author":"Heilbron F. C.","unstructured":"F. C. Heilbron, V. Escorcia, B. Ghanem, and J. C. Niebles. 2015. ActivityNet: A large-scale video benchmark for human activity understanding. In IEEE Conference on Computer Vision and Pattern Recognition. 961--970."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1168"},{"key":"e_1_3_2_2_29_1","volume-title":"Localizing Moments in Video with Natural Language. In 2017 IEEE International Conference on Computer Vision (ICCV). 5804--5813","author":"Hendricks Lisa Anne","unstructured":"Lisa Anne Hendricks, Oliver Wang, Eli Shechtman, Josef Sivic, Trevor Darrell, and Bryan C. Russell. 2017a. Localizing Moments in Video with Natural Language. In 2017 IEEE International Conference on Computer Vision (ICCV). 5804--5813."},{"key":"e_1_3_2_2_30_1","volume-title":"Localizing Moments in Video with Natural Language. In 2017 IEEE International Conference on Computer Vision. 5804--5813","author":"Hendricks Lisa Anne","unstructured":"Lisa Anne Hendricks, Oliver Wang, Eli Shechtman, Josef Sivic, Trevor Darrell, and Bryan C. Russell. 2017b. Localizing Moments in Video with Natural Language. In 2017 IEEE International Conference on Computer Vision. 5804--5813."},{"key":"e_1_3_2_2_31_1","volume-title":"International Conference on Learning Representations .","author":"Hjelm R Devon","year":"2019","unstructured":"R Devon Hjelm, Alex Fedorov, Samuel Lavoie-Marchildon, Karan Grewal, Phil Bachman, Adam Trischler, and Yoshua Bengio. 2019. Learning deep representations by mutual information estimation and maximization. In International Conference on Learning Representations ."},{"key":"e_1_3_2_2_32_1","volume-title":"International Conference on Learning Representations .","author":"Huang Hsin-Yuan","year":"2018","unstructured":"Hsin-Yuan Huang, Chenguang Zhu, Yelong Shen, and Weizhu Chen. 2018. FusionNet: Fusing via Fully-aware Attention with Application to Machine Comprehension. In International Conference on Learning Representations ."},{"key":"e_1_3_2_2_33_1","volume-title":"Independent component analysis: algorithms and applications. Neural networks","author":"Erkki Oja Aapo","year":"2000","unstructured":"Aapo Hyv\"arinen and Erkki Oja. 2000. Independent component analysis: algorithms and applications. Neural networks , Vol. 13, 4--5 (2000), 411--430."},{"key":"e_1_3_2_2_34_1","volume-title":"Exploring the limits of language modeling. arXiv preprint arXiv:1602.02410","author":"Jozefowicz Rafal","year":"2016","unstructured":"Rafal Jozefowicz, Oriol Vinyals, Mike Schuster, Noam Shazeer, and Yonghui Wu. 2016. Exploring the limits of language modeling. arXiv preprint arXiv:1602.02410 (2016)."},{"key":"e_1_3_2_2_35_1","volume-title":"et almbox","author":"Kay Will","year":"2017","unstructured":"Will Kay, Joao Carreira, Karen Simonyan, Brian Zhang, Chloe Hillier, Sudheendra Vijayanarasimhan, Fabio Viola, Tim Green, Trevor Back, Paul Natsev, et almbox. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_2_36_1","volume-title":"Dense-Captioning Events in Videos. In IEEE International Conference on Computer Vision . 706--715","author":"Krishna R.","unstructured":"R. Krishna, K. Hata , F. Ren, L. Fei-Fei, and J. C. Niebles. 2017. Dense-Captioning Events in Videos. In IEEE International Conference on Computer Vision . 706--715."},{"key":"e_1_3_2_2_37_1","volume-title":"TVR: A Large-Scale Dataset for Video-Subtitle Moment Retrieval. In The European Conference on Computer Vision .","author":"Lei Jie","year":"2020","unstructured":"Jie Lei, Licheng Yu, Tamara L Berg, and Mohit Bansal. 2020. TVR: A Large-Scale Dataset for Video-Subtitle Moment Retrieval. In The European Conference on Computer Vision ."},{"key":"e_1_3_2_2_38_1","unstructured":"Linjie Li Yen-Chun Chen Yu Cheng Zhe Gan Licheng Yu and Jingjing Liu. 2020 a. HERO: Hierarchical Encoder for Video"},{"key":"e_1_3_2_2_39_1","volume-title":"Omni-representation Pre-training. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). Association for Computational Linguistics","author":"Language","unstructured":"Language Omni-representation Pre-training. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). Association for Computational Linguistics, Online, 2046--2065."},{"key":"e_1_3_2_2_40_1","volume-title":"2020 b. SEA: Sentence Encoder Assembly for Video Retrieval by Textual Queries","author":"Li Xirong","year":"2020","unstructured":"Xirong Li, Fangming Zhou, Chaoxi Xu, Jiaqi Ji, and Gang Yang. 2020 b. SEA: Sentence Encoder Assembly for Video Retrieval by Textual Queries. IEEE Transactions on Multimedia (2020)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240549"},{"key":"e_1_3_2_2_44_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1518"},{"key":"e_1_3_2_2_46_1","unstructured":"Jiasen Lu Dhruv Batra Devi Parikh and Stefan Lee. 2019 a. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Advances in Neural Information Processing Systems. 13--23."},{"key":"e_1_3_2_2_47_1","volume-title":"Univilm: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353","author":"Luo Huaishao","year":"2020","unstructured":"Huaishao Luo, Lei Ji, Botian Shi, Haoyang Huang, Nan Duan, Tianrui Li, Xilin Chen, and Ming Zhou. 2020. Univilm: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"e_1_3_2_2_49_1","volume-title":"Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516","author":"Miech Antoine","year":"2018","unstructured":"Antoine Miech, Ivan Laptev, and Josef Sivic. 2018. Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516 (2018)."},{"key":"e_1_3_2_2_50_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition . 6707--6717","author":"Misra Ishan","unstructured":"Ishan Misra and Laurens van der Maaten. 2020. Self-supervised learning of pretext-invariant representations. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition . 6707--6717."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_2_52_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 11592--11601","author":"Mithun Niluthpol Chowdhury","unstructured":"Niluthpol Chowdhury Mithun, Sujoy Paul, and Amit K. Roy-Chowdhury. 2019. Weakly Supervised Video Moment Retrieval From Text Queries. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 11592--11601."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_2_2_54_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.497"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093328"},{"key":"e_1_3_2_2_58_1","volume-title":"Bidirectional Attention Flow for Machine Comprehension. In International Conference on Learning Representations .","author":"Seo Minjoon","year":"2017","unstructured":"Minjoon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi. 2017. Bidirectional Attention Flow for Machine Comprehension. In International Conference on Learning Representations ."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_13"},{"key":"e_1_3_2_2_60_1","volume-title":"Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743","author":"Sun Chen","year":"2019","unstructured":"Chen Sun, Fabien Baradel, Kevin Murphy, and Cordelia Schmid. 2019. Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743 (2019)."},{"key":"e_1_3_2_2_61_1","volume-title":"International Conference on Learning Representations .","author":"Sun Fan-Yun","year":"2020","unstructured":"Fan-Yun Sun, Jordan Hoffman, Vikas Verma, and Jian Tang. 2020. InfoGraph: Unsupervised and Semi-supervised Graph-Level Representation Learning via Mutual Information Maximization. In International Conference on Learning Representations ."},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_2_2_63_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems. 5998--6008."},{"key":"e_1_3_2_2_64_1","volume-title":"Deep Graph Infomax. In International Conference on Learning Representations .","author":"Fedus William","year":"2019","unstructured":"William Fedus, William L. Hamilton, Pietro Li\u00f2, Yoshua Bengio, and R Devon Hjelm. 2019. Deep Graph Infomax. In International Conference on Learning Representations ."},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413975"},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6897"},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00042"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1018"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413862"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6924"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"e_1_3_2_2_73_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"e_1_3_2_2_74_1","volume-title":"International Conference on Learning Representations .","author":"Yu Adams Wei","year":"2018","unstructured":"Adams Wei Yu, David Dohan, Quoc Le, Thang Luong, Rui Zhao, and Kai Chen. 2018a. Fast and Accurate Reading Comprehension by Combining Self-Attention and Convolution. In International Conference on Learning Representations ."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_2_76_1","unstructured":"Yitian Yuan Lin Ma Jingwen Wang Wei Liu and Wenwu Zhu. 2019 a. Semantic Conditioned Dynamic Modulation for Temporal Sentence Grounding in Videos. In Advances in Neural Information Processing Systems. 536--546."},{"key":"e_1_3_2_2_77_1","volume-title":"To Find Where You Talk: Temporal Sentence Localization in Video with Attention Based Location Regression. In Proceedings of the AAAI Conference on Artificial Intelligence","volume":"33","author":"Yuan Yitian","year":"2019","unstructured":"Yitian Yuan, Tao Mei, and Wenwu Zhu. 2019 b. To Find Where You Talk: Temporal Sentence Localization in Video with Attention Based Location Regression. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 33. 9159--9166."},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"e_1_3_2_2_79_1","volume-title":"2020 a. A Hierarchical Multi-Modal Encoder for Moment Localization in Video Corpus. arXiv preprint arXiv:2011.09046","author":"Zhang Bowen","year":"2020","unstructured":"Bowen Zhang, Hexiang Hu, Joonseok Lee, Ming Zhao, Sheide Chammas, Vihan Jain, Eugene Ie, and Fei Sha. 2020 a. A Hierarchical Multi-Modal Encoder for Moment Localization in Video Corpus. arXiv preprint arXiv:2011.09046 (2020)."},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"e_1_3_2_2_81_1","volume-title":"Joey Tianyi Zhou, and Rick Siow Mong Goh","author":"Zhang Hao","year":"2021","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, Liangli Zhen, Joey Tianyi Zhou, and Rick Siow Mong Goh. 2021. Natural Language Video Localization: A Revisit in Span-based Question Answering Framework. IEEE Transactions on Pattern Analysis and Machine Intelligence (2021)."},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"e_1_3_2_2_83_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence .","author":"Zhang Songyang","year":"2020","unstructured":"Songyang Zhang, Houwen Peng, Jianlong Fu, and Jiebo Luo. 2020 b. Learning 2D Temporal Adjacent Networks forMoment Localization with Natural Language. In Proceedings of the AAAI Conference on Artificial Intelligence ."},{"key":"e_1_3_2_2_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350879"},{"key":"e_1_3_2_2_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331235"},{"key":"e_1_3_2_2_86_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"e_1_3_2_2_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00610"}],"event":{"name":"SIGIR '21: The 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Virtual Event Canada","acronym":"SIGIR '21","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462874","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3404835.3462874","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:47:17Z","timestamp":1750193237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462874"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,11]]},"references-count":87,"alternative-id":["10.1145\/3404835.3462874","10.1145\/3404835"],"URL":"https:\/\/doi.org\/10.1145\/3404835.3462874","relation":{},"subject":[],"published":{"date-parts":[[2021,7,11]]},"assertion":[{"value":"2021-07-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}