{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:32:25Z","timestamp":1767339145080,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,15]],"date-time":"2019-10-15T00:00:00Z","timestamp":1571097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1813709","1722847","1704337"],"award-info":[{"award-number":["1813709","1722847","1704337"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,15]]},"DOI":"10.1145\/3343031.3350879","type":"proceedings-article","created":{"date-parts":[[2019,10,21]],"date-time":"2019-10-21T16:32:26Z","timestamp":1571675546000},"page":"1230-1238","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":51,"title":["Exploiting Temporal Relationships in Video Moment Localization with Natural Language"],"prefix":"10.1145","author":[{"given":"Songyang","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]},{"given":"Jinsong","family":"Su","sequence":"additional","affiliation":[{"name":"Xiamen University, Xiamen, China"}]},{"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]}],"member":"320","published-online":{"date-parts":[[2019,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1177\/0961463X93002002005"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1093\/jos\/5.4.345"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Jingyuan Chen Xinpeng Chen Lin Ma Zequn Jie and Tat-Seng Chua. 2018. Temporally Grounding Natural Sentence in Video. In EMNLP .  Jingyuan Chen Xinpeng Chen Lin Ma Zequn Jie and Tat-Seng Chua. 2018. Temporally Grounding Natural Sentence in Video. In EMNLP .","DOI":"10.18653\/v1\/D18-1015"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Jingyuan Chen Lin Ma Xinpeng Chen Zequn Jie and Jiebo Luo. 2019. Localizing Natural Language in Videos. In AAAI .  Jingyuan Chen Lin Ma Xinpeng Chen Zequn Jie and Jiebo Luo. 2019. Localizing Natural Language in Videos. In AAAI .","DOI":"10.1609\/aaai.v33i01.33018175"},{"volume-title":"Automatically ordering events and times in text","author":"Derczynski Leon RA","key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-319-47241-6"},{"volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","year":"2018","author":"Devlin Jacob","key":"e_1_3_2_1_6_1"},{"volume-title":"TALL: Temporal Activity Localization via Language Query. In ICCV .","year":"2017","author":"Gao Jiyang","key":"e_1_3_2_1_7_1"},{"volume-title":"MAC: Mining Activity Concepts for Language-based Temporal Localization. In WACV .","year":"2019","author":"Ge Runzhou","key":"e_1_3_2_1_8_1"},{"key":"e_1_3_2_1_9_1","unstructured":"Dongliang He Xiang Zhao Jizhou Huang Fu Li Xiao Liu and Shilei Wen. 2019. Read Watch and Move: Reinforcement Learning for Temporally Grounding Natural Language Descriptions in Videos. In AAAI .  Dongliang He Xiang Zhao Jizhou Huang Fu Li Xiao Liu and Shilei Wen. 2019. Read Watch and Move: Reinforcement Learning for Temporally Grounding Natural Language Descriptions in Videos. In AAAI ."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Lisa Anne Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing Moments in Video With Natural Language. In ICCV .  Lisa Anne Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing Moments in Video With Natural Language. In ICCV .","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Lisa Anne Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2018. Localizing Moments in Video with Temporal Language.. In EMNLP .  Lisa Anne Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2018. Localizing Moments in Video with Temporal Language.. In EMNLP .","DOI":"10.18653\/v1\/D18-1168"},{"key":"e_1_3_2_1_12_1","volume-title":"Natural Language Semantics","volume":"5","author":"Hitzeman Janet","year":"1995"},{"key":"e_1_3_2_1_13_1","unstructured":"Ronghang Hu Marcus Rohrbach Jacob Andreas Trevor Darrell and Kate Saenko. 2017. Modeling relationships in referential expressions with compositional modular networks. In CVPR .  Ronghang Hu Marcus Rohrbach Jacob Andreas Trevor Darrell and Kate Saenko. 2017. Modeling relationships in referential expressions with compositional modular networks. In CVPR ."},{"volume-title":"Adam: A method for stochastic optimization. In ICLR .","year":"2015","author":"Kingma Diederik P","key":"e_1_3_2_1_14_1"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Nikita Kitaev and Dan Klein. 2018. Constituency Parsing with a Self-Attentive Encoder. In ACL .  Nikita Kitaev and Dan Klein. 2018. Constituency Parsing with a Self-Attentive Encoder. In ACL .","DOI":"10.18653\/v1\/P18-1249"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Junwei Liang Lu Jiang Liangliang Cao Li-Jia Li and Alexander G Hauptmann. 2018. Focal visual-text attention for visual question answering. In CVPR .  Junwei Liang Lu Jiang Liangliang Cao Li-Jia Li and Alexander G Hauptmann. 2018. Focal visual-text attention for visual question answering. In CVPR .","DOI":"10.1109\/CVPR.2018.00642"},{"key":"e_1_3_2_1_17_1","unstructured":"Bingbin Liu Serena Yeung Edward Chou De-An Huang Li Fei-Fei and Juan Carlos Niebles. 2018c. Temporal Modular Networks for Retrieving Complex Compositional Activities in Videos. In ECCV .  Bingbin Liu Serena Yeung Edward Chou De-An Huang Li Fei-Fei and Juan Carlos Niebles. 2018c. Temporal Modular Networks for Retrieving Complex Compositional Activities in Videos. In ECCV ."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Meng Liu Xiang Wang Liqiang Nie Xiangnan He Baoquan Chen and Tat-Seng Chua. 2018a. Attentive moment retrieval in videos. In SIGIR .  Meng Liu Xiang Wang Liqiang Nie Xiangnan He Baoquan Chen and Tat-Seng Chua. 2018a. Attentive moment retrieval in videos. In SIGIR .","DOI":"10.1145\/3209978.3210003"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Meng Liu Xiang Wang Liqiang Nie Qi Tian Baoquan Chen and Tat-Seng Chua. 2018b. Cross-modal Moment Localization in Videos. In ACM MM .  Meng Liu Xiang Wang Liqiang Nie Qi Tian Baoquan Chen and Tat-Seng Chua. 2018b. Cross-modal Moment Localization in Videos. In ACM MM .","DOI":"10.1145\/3240508.3240549"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Xihui Liu Zihao Wang Jing Shao Xiaogang Wang and Hongsheng Li. 2019. Improving Referring Expression Grounding with Cross-modal Attention-guided Erasing. CVPR .  Xihui Liu Zihao Wang Jing Shao Xiaogang Wang and Hongsheng Li. 2019. Improving Referring Expression Grounding with Cross-modal Attention-guided Erasing. CVPR .","DOI":"10.1109\/CVPR.2019.00205"},{"volume-title":"Glove: Global vectors for word representation. In EMNLP .","year":"2014","author":"Pennington Jeffrey","key":"e_1_3_2_1_21_1"},{"volume-title":"The tenses of verbs. The language of time: A reader","year":"2005","author":"Reichenbach Hans","key":"e_1_3_2_1_22_1"},{"volume-title":"Temporal specification of the present perfect: a corpus-based study. Language and Computers (01","year":"2002","author":"Schl\u00fcter Norbert","key":"e_1_3_2_1_23_1"},{"key":"e_1_3_2_1_24_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very deep convolutional networks for large-scale image recognition. ICLR .  Karen Simonyan and Andrew Zisserman. 2015. Very deep convolutional networks for large-scale image recognition. ICLR ."},{"volume-title":"Manning","year":"2015","author":"Tai Kai Sheng","key":"e_1_3_2_1_25_1"},{"key":"e_1_3_2_1_26_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NIPS .  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NIPS ."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Limin Wang Yuanjun Xiong Zhe Wang Yu Qiao Dahua Lin Xiaoou Tang and Luc Van Gool. 2016. Temporal segment networks: Towards good practices for deep action recognition. In ECCV .  Limin Wang Yuanjun Xiong Zhe Wang Yu Qiao Dahua Lin Xiaoou Tang and Luc Van Gool. 2016. Temporal segment networks: Towards good practices for deep action recognition. In ECCV .","DOI":"10.1007\/978-3-319-46484-8_2"},{"volume-title":"Gregory D Hager, and Trac D Tran.","year":"2018","author":"Xiang Xiang","key":"e_1_3_2_1_28_1"},{"key":"e_1_3_2_1_29_1","unstructured":"Huijuan Xu Kun He Bryan A. Plummer Leonid Sigal Stan Sclaroff and Kate Saenko. 2019. Multilevel Language and Vision Integration for Text-to-Clip Retrieval.. In AAAI .  Huijuan Xu Kun He Bryan A. Plummer Leonid Sigal Stan Sclaroff and Kate Saenko. 2019. Multilevel Language and Vision Integration for Text-to-Clip Retrieval.. In AAAI ."},{"key":"e_1_3_2_1_30_1","unstructured":"Licheng Yu Zhe Lin Xiaohui Shen Jimei Yang Xin Lu Mohit Bansal and Tamara L Berg. 2018. MAttNet: Modular Attention Network for Referring Expression Comprehension. In CVPR .  Licheng Yu Zhe Lin Xiaohui Shen Jimei Yang Xin Lu Mohit Bansal and Tamara L Berg. 2018. MAttNet: Modular Attention Network for Referring Expression Comprehension. In CVPR ."},{"volume-title":"Davis","year":"2019","author":"Zhang Da","key":"e_1_3_2_1_31_1"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Hanwang Zhang Yulei Niu and Shih-Fu Chang. 2018. Grounding referring expressions in images by variational context. In CVPR .  Hanwang Zhang Yulei Niu and Shih-Fu Chang. 2018. Grounding referring expressions in images by variational context. In CVPR .","DOI":"10.1109\/CVPR.2018.00437"}],"event":{"name":"MM '19: The 27th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Nice France","acronym":"MM '19"},"container-title":["Proceedings of the 27th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350879","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3350879","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3350879","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:25Z","timestamp":1750202005000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350879"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,15]]},"references-count":32,"alternative-id":["10.1145\/3343031.3350879","10.1145\/3343031"],"URL":"https:\/\/doi.org\/10.1145\/3343031.3350879","relation":{},"subject":[],"published":{"date-parts":[[2019,10,15]]},"assertion":[{"value":"2019-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}