{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T19:34:57Z","timestamp":1768073697808,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":90,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681042","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"8383-8392","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["SynopGround: A Large-Scale Dataset for Multi-Paragraph Video Grounding from TV Dramas and Synopses"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2864-0696","authenticated-orcid":false,"given":"Chaolei","family":"Tan","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7884-0044","authenticated-orcid":false,"given":"Zihang","family":"Lin","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9915-5437","authenticated-orcid":false,"given":"Junfu","family":"Pu","sequence":"additional","affiliation":[{"name":"ARC Lab, Tencent PCG, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8298-4063","authenticated-orcid":false,"given":"Zhongang","family":"Qi","sequence":"additional","affiliation":[{"name":"ARC Lab, Tencent PCG, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7270-2480","authenticated-orcid":false,"given":"Wei-Yi","family":"Pei","sequence":"additional","affiliation":[{"name":"Tencent Video, PCG, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0239-9232","authenticated-orcid":false,"given":"Zhi","family":"Qu","sequence":"additional","affiliation":[{"name":"Tencent Video, PCG, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3407-1215","authenticated-orcid":false,"given":"Yexin","family":"Wang","sequence":"additional","affiliation":[{"name":"Tencent Video, PCG, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7673-8325","authenticated-orcid":false,"given":"Ying","family":"Shan","sequence":"additional","affiliation":[{"name":"ARC Lab, Tencent PCG, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8327-0003","authenticated-orcid":false,"given":"Wei-Shi","family":"Zheng","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University &amp; Key Laboratory of Machine Intelligence and Advanced Computing, Ministry of Education, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2678-5373","authenticated-orcid":false,"given":"Jian-Fang","family":"Hu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University &amp; Guangdong Province Key Laboratory of Information Security Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Ht-step: Aligning instructional articles with how-to videos. In NeurIPS.","author":"Afouras Triantafyllos","year":"2023","unstructured":"Triantafyllos Afouras, Effrosyni Mavroudi, Tushar Nagarajan, Huiyu Wang, and Lorenzo Torresani. 2023. Ht-step: Aligning instructional articles with how-to videos. In NeurIPS."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Lisa Anne Hendricks Oliver Wang Eli Shechtman Josef Sivic Trevor Darrell and Bryan Russell. 2017. Localizing moments in video with natural language. In ICCV.","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Max Bain Arsha Nagrani Andrew Brown and Andrew Zisserman. 2020. Condensed movies: Story based retrieval with contextual embeddings. In ACCV.","DOI":"10.1007\/978-3-030-69541-5_28"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Max Bain Arsha Nagrani G\u00fcl Varol and Andrew Zisserman. 2021. Frozen in time: A joint video and image encoder for end-to-end retrieval. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Peijun Bao Qian Zheng and Yadong Mu. 2021. Dense events grounding in video. In AAAI.","DOI":"10.1609\/aaai.v35i2.16175"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Omer Bar-Tal Dolev Ofri-Amar Rafail Fridman Yoni Kasten and Tali Dekel. 2022. Text2live: Text-driven layered image and video editing. In ECCV.","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"e_1_3_2_1_7_1","volume-title":"Activitynet: A large-scale video benchmark for human activity understanding. In CVPR.","author":"Heilbron Fabian Caba","year":"2015","unstructured":"Fabian Caba Heilbron, Victor Escorcia, Bernard Ghanem, and Juan Carlos Niebles. 2015. Activitynet: A large-scale video benchmark for human activity understanding. In CVPR."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Nicolas Carion Francisco Massa Gabriel Synnaeve Nicolas Usunier Alexander Kirillov and Sergey Zagoruyko. 2020. End-to-end object detection with transformers. In ECCV.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_9_1","volume-title":"A short note about kinetics-600. arXiv preprint arXiv:1808.01340","author":"Carreira Joao","year":"2018","unstructured":"Joao Carreira, Eric Noland, Andras Banki-Horvath, Chloe Hillier, and Andrew Zisserman. 2018. A short note about kinetics-600. arXiv preprint arXiv:1808.01340 (2018)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Joao Carreira and Andrew Zisserman. 2017. Quo vadis action recognition? a new model and the kinetics dataset. In CVPR.","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Long Chen Chujie Lu Siliang Tang Jun Xiao Dong Zhang Chilie Tan and Xiaolin Li. 2020. Rethinking the bottom-up framework for query-based video localization. In AAAI.","DOI":"10.1609\/aaai.v34i07.6627"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Long Chen Yulei Niu Brian Chen Xudong Lin Guangxing Han Christopher Thomas Hammad Ayyubi Heng Ji and Shih-Fu Chang. 2022. Weakly-supervised temporal article grounding. In EMNLP.","DOI":"10.18653\/v1\/2022.emnlp-main.639"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Shizhe Chen Yida Zhao Qin Jin and Qi Wu. 2020. Fine-grained video-text retrieval with hierarchical graph reasoning. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_1_14_1","unstructured":"Yi-Wen Chen Yi-Hsuan Tsai and Ming-Hsuan Yang. 2021. End-to-end multi-modal video temporal grounding. In NeurIPS."},{"key":"e_1_3_2_1_15_1","volume-title":"State-of-the-art and future challenges in video scene detection: a survey. Multimedia Systems","author":"Fabro Manfred Del","year":"2013","unstructured":"Manfred Del Fabro and Laszlo B\u00f6sz\u00f6rmenyi. 2013. State-of-the-art and future challenges in video scene detection: a survey. Multimedia Systems (2013)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Jianfeng Dong Xianke Chen Minsong Zhang Xun Yang Shujie Chen Xirong Li and Xun Wang. 2022. Partially Relevant Video Retrieval. In ACM MM.","DOI":"10.1145\/3503161.3547976"},{"key":"e_1_3_2_1_17_1","volume-title":"Svtr: Scene text recognition with a single visual model. arXiv preprint arXiv:2205.00159","author":"Du Yongkun","year":"2022","unstructured":"Yongkun Du, Zhineng Chen, Caiyan Jia, Xiaoting Yin, Tianlun Zheng, Chenxia Li, Yuning Du, and Yu-Gang Jiang. 2022. Svtr: Scene text recognition with a single visual model. arXiv preprint arXiv:2205.00159 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"Temporal localization of moments in video collections with natural language. arXiv preprint arXiv:1907.12763","author":"Escorcia Victor","year":"2019","unstructured":"Victor Escorcia, Mattia Soldan, Josef Sivic, Bernard Ghanem, and Bryan Russell. 2019. Temporal localization of moments in video collections with natural language. arXiv preprint arXiv:1907.12763 (2019)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Christoph Feichtenhofer Haoqi Fan Jitendra Malik and Kaiming He. 2019. Slowfast networks for video recognition. In ICCV.","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_20_1","volume-title":"Text-based editing of talking-head video. TOG","author":"Fried Ohad","year":"2019","unstructured":"Ohad Fried, Ayush Tewari, Michael Zollh\u00f6fer, Adam Finkelstein, Eli Shechtman, Dan B Goldman, Kyle Genova, Zeyu Jin, Christian Theobalt, and Maneesh Agrawala. 2019. Text-based editing of talking-head video. TOG (2019)."},{"key":"e_1_3_2_1_21_1","volume-title":"Scott T Grafton, Miguel P Eckstein, and William Yang Wang.","author":"Fu Tsu-Jui","year":"2022","unstructured":"Tsu-Jui Fu, Xin Eric Wang, Scott T Grafton, Miguel P Eckstein, and William Yang Wang. 2022. M3L: Language-based video editing via multi-modal multi-level transformers. In CVPR."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Valentin Gabeur Chen Sun Karteek Alahari and Cordelia Schmid. 2020. Multi-modal transformer for video retrieval. In ECCV.","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_23_1","volume-title":"Tall: Temporal activity localization via language query. In ICCV.","author":"Gao Jiyang","year":"2017","unstructured":"Jiyang Gao, Chen Sun, Zhenheng Yang, and Ram Nevatia. 2017. Tall: Temporal activity localization via language query. In ICCV."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Kristen Grauman Andrew Westbury Eugene Byrne Zachary Chavis Antonino Furnari Rohit Girdhar Jackson Hamburger Hao Jiang Miao Liu Xingyu Liu et al. 2022. Ego4d: Around the world in 3 000 hours of egocentric video. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"e_1_3_2_1_25_1","volume-title":"Ava: A video dataset of spatio-temporally localized atomic visual actions. In CVPR.","author":"Gu Chunhui","year":"2018","unstructured":"Chunhui Gu, Chen Sun, David A Ross, Carl Vondrick, Caroline Pantofaru, Yeqing Li, Sudheendra Vijayanarasimhan, George Toderici, Susanna Ricco, Rahul Sukthankar, et al. 2018. Ava: A video dataset of spatio-temporally localized atomic visual actions. In CVPR."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Monica-Laura Haurilet Makarand Tapaswi Ziad Al-Halah and Rainer Stiefelhagen. 2016. Naming TV characters by watching and analyzing dialogs. In WACV.","DOI":"10.1109\/WACV.2016.7477560"},{"key":"e_1_3_2_1_27_1","volume-title":"Mike Zheng Shou, and Nan Duan","author":"Hou Zhijian","year":"2023","unstructured":"Zhijian Hou, Wanjun Zhong, Lei Ji, Difei Gao, Kun Yan, W.k. Chan, Chong-Wah Ngo, Mike Zheng Shou, and Nan Duan. 2023. CONE: An Efficient COarse-to-fiNE Alignment Framework for Long Video Temporal Grounding. In ACL."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Jiabo Huang Hailin Jin Shaogang Gong and Yang Liu. 2022. Video activity localisation with uncertainties in temporal boundary. In ECCV.","DOI":"10.1007\/978-3-031-19830-4_41"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Qingqiu Huang Yu Xiong and Dahua Lin. 2018. Unifying identification and context learning for person recognition. In CVPR.","DOI":"10.1109\/CVPR.2018.00236"},{"key":"e_1_3_2_1_30_1","volume-title":"Movienet: A holistic dataset for movie understanding. In ECCV.","author":"Huang Qingqiu","year":"2020","unstructured":"Qingqiu Huang, Yu Xiong, Anyi Rao, Jiaze Wang, and Dahua Lin. 2020. Movienet: A holistic dataset for movie understanding. In ECCV."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Jinhyun Jang Jungin Park Jin Kim Hyeongjun Kwon and Kwanghoon Sohn. 2023. Knowing Where to Focus: Event-aware Transformer for Video Grounding. In ICCV.","DOI":"10.1109\/ICCV51070.2023.01273"},{"key":"e_1_3_2_1_32_1","volume-title":"Tgif-qa: Toward spatio-temporal reasoning in visual question answering. In CVPR.","author":"Jang Yunseok","year":"2017","unstructured":"Yunseok Jang, Yale Song, Youngjae Yu, Youngjin Kim, and Gunhee Kim. 2017. Tgif-qa: Toward spatio-temporal reasoning in visual question answering. In CVPR."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00235"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Xun Jiang Xing Xu Jingran Zhang Fumin Shen Zuo Cao and Heng Tao Shen. 2022. Semi-supervised Video Paragraph Grounding with Contrastive Encoder. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00250"},{"key":"e_1_3_2_1_35_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Sharath Koorathota Patrick Adelman Kelly Cotton and Paul Sajda. 2021. Editing like humans: a contextual multimodal framework for automated video editing. In CVPR.","DOI":"10.1109\/CVPRW53098.2021.00186"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Kenji Hata Frederic Ren Li Fei-Fei and Juan Carlos Niebles. 2017. Dense-captioning events in videos. In ICCV.","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_38_1","volume-title":"A closer look at debiased temporal sentence grounding in videos: Dataset, metric, and approach. TOMM","author":"Lan Xiaohan","year":"2023","unstructured":"Xiaohan Lan, Yitian Yuan, Xin Wang, Long Chen, Zhi Wang, Lin Ma, and Wenwu Zhu. 2023. A closer look at debiased temporal sentence grounding in videos: Dataset, metric, and approach. TOMM (2023)."},{"key":"e_1_3_2_1_39_1","unstructured":"Thao Minh Le Vuong Le Svetha Venkatesh and Truyen Tran. 2020. Hierarchical conditional relation networks for video question answering. In CVPR."},{"key":"e_1_3_2_1_40_1","unstructured":"Jie Lei Tamara L Berg and Mohit Bansal. 2021. Detecting moments and highlights in videos via natural language queries. In NeurIPS."},{"key":"e_1_3_2_1_41_1","volume-title":"TVQA: Localized, Compositional Video Question Answering. In EMNLP.","author":"Lei Jie","year":"2018","unstructured":"Jie Lei, Licheng Yu, Mohit Bansal, and Tamara Berg. 2018. TVQA: Localized, Compositional Video Question Answering. In EMNLP."},{"key":"e_1_3_2_1_42_1","volume-title":"Tvr: A large-scale dataset for video-subtitle moment retrieval. In ECCV.","author":"Lei Jie","year":"2020","unstructured":"Jie Lei, Licheng Yu, Tamara L Berg, and Mohit Bansal. 2020. Tvr: A large-scale dataset for video-subtitle moment retrieval. In ECCV."},{"key":"e_1_3_2_1_43_1","unstructured":"Hongxiang Li Meng Cao Xuxin Cheng Yaowei Li Zhihong Zhu and Yuexian Zou. 2023. G2l: Semantically aligned and uniform video grounding via geodesic and game theory. In ICCV."},{"key":"e_1_3_2_1_44_1","unstructured":"Yicong Li Xiang Wang Junbin Xiao Wei Ji and Tat-Seng Chua. 2022. Invariant grounding for video question answering. In CVPR."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Tianming Liang Chaolei Tan Beihao Xia Wei-Shi Zheng and Jian-Fang Hu. 2024. Ranking Distillation for Open-Ended Video Question Answering with Insufficient Labels. In CVPR.","DOI":"10.1109\/CVPR52733.2024.01250"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Minghui Liao Zhaoyi Wan Cong Yao Kai Chen and Xiang Bai. 2020. Real-time scene text detection with differentiable binarization. In AAAI.","DOI":"10.1609\/aaai.v34i07.6812"},{"key":"e_1_3_2_1_47_1","unstructured":"Zihang Lin Chaolei Tan Jian-Fang Hu Zhi Jin Tiancai Ye and Wei-Shi Zheng. 2023. Collaborative static and dynamic vision-language streams for spatio-temporal video grounding. In CVPR."},{"key":"e_1_3_2_1_48_1","unstructured":"Daizong Liu and Wei Hu. 2022. Skimming locating then perusing: A human-like framework for natural language video localization. In ACM MM."},{"key":"e_1_3_2_1_49_1","unstructured":"Daizong Liu Xiaoye Qu Jianfeng Dong Pan Zhou Yu Cheng Wei Wei Zichuan Xu and Yulai Xie. 2021. Context-aware biaffine localizing network for temporal sentence grounding. In CVPR."},{"key":"e_1_3_2_1_50_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_51_1","volume-title":"Debug: A dense bottom-up grounding approach for natural language video localization. In EMNLP.","author":"Lu Chujie","year":"2019","unstructured":"Chujie Lu, Long Chen, Chilie Tan, Xiaolin Li, and Jun Xiao. 2019. Debug: A dense bottom-up grounding approach for natural language video localization. In EMNLP."},{"key":"e_1_3_2_1_52_1","unstructured":"Junhua Mao Jonathan Huang Alexander Toshev Oana Camburu Alan L Yuille and Kevin Murphy. 2016. Generation and comprehension of unambiguous object descriptions. In CVPR."},{"key":"e_1_3_2_1_53_1","unstructured":"Jonghwan Mun Minsu Cho and Bohyung Han. 2020. Local-global video-text interactions for temporal grounding. In CVPR."},{"key":"e_1_3_2_1_54_1","volume-title":"From benedict cumberbatch to sherlock holmes: Character identification in tv series without a script. arXiv preprint arXiv:1801.10442","author":"Nagrani Arsha","year":"2018","unstructured":"Arsha Nagrani and Andrew Zisserman. 2018. From benedict cumberbatch to sherlock holmes: Character identification in tv series without a script. arXiv preprint arXiv:1801.10442 (2018)."},{"key":"e_1_3_2_1_55_1","unstructured":"Guoshun Nan Rui Qiao Yao Xiao Jun Liu Sicong Leng Hao Zhang and Wei Lu. 2021. Interventional video grounding with dual contrastive learning. In CVPR."},{"key":"e_1_3_2_1_56_1","unstructured":"Mayu Otani Yuta Nakashima Esa Rahtu and Janne Heikkil\"a. 2020. Uncovering hidden challenges in query-based video moment retrieval. In BMVC."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Yulin Pan Xiangteng He Biao Gong Yiliang Lv Yujun Shen Yuxin Peng and Deli Zhao. 2023. Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos. In ICCV.","DOI":"10.1109\/ICCV51070.2023.01266"},{"key":"e_1_3_2_1_58_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Anyi Rao Jiaze Wang Linning Xu Xuekun Jiang Qingqiu Huang Bolei Zhou and Dahua Lin. 2020. A unified framework for shot type classification based on subject centric lens. In ECCV.","DOI":"10.1007\/978-3-030-58621-8_2"},{"key":"e_1_3_2_1_60_1","volume-title":"Grounding action descriptions in videos. TACL","author":"Regneri Michaela","year":"2013","unstructured":"Michaela Regneri, Marcus Rohrbach, Dominikus Wetzel, Stefan Thater, Bernt Schiele, and Manfred Pinkal. 2013. Grounding action descriptions in videos. TACL (2013)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Hamid Rezatofighi Nathan Tsoi JunYoung Gwak Amir Sadeghian Ian Reid and Silvio Savarese. 2019. Generalized intersection over union: A metric and a loss for bounding box regression. In CVPR.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Marcus Rohrbach Michaela Regneri Mykhaylo Andriluka Sikandar Amin Manfred Pinkal and Bernt Schiele. 2012. Script data for attribute-based recognition of composite activities. In ECCV.","DOI":"10.1007\/978-3-642-33718-5_11"},{"key":"e_1_3_2_1_63_1","volume-title":"End-to-end dense video grounding via parallel regression. Computer Vision and Image Understanding","author":"Shi Fengyuan","year":"2024","unstructured":"Fengyuan Shi, Weilin Huang, and Limin Wang. 2024. End-to-end dense video grounding via parallel regression. Computer Vision and Image Understanding (2024)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Gunnar A Sigurdsson G\u00fcl Varol Xiaolong Wang Ali Farhadi Ivan Laptev and Abhinav Gupta. 2016. Hollywood in homes: Crowdsourcing data collection for activity understanding. In ECCV.","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"crossref","unstructured":"Gabriel S Simoes J\u00f4natas Wehrmann Rodrigo C Barros and Duncan D Ruiz. 2016. Movie genre classification with convolutional neural networks. In IJCNN.","DOI":"10.1109\/IJCNN.2016.7727207"},{"key":"e_1_3_2_1_66_1","volume-title":"Fabian Caba, Chen Zhao, Silvio Giancola, and Bernard Ghanem.","author":"Soldan Mattia","year":"2022","unstructured":"Mattia Soldan, Alejandro Pardo, Juan Le\u00f3n Alc\u00e1zar, Fabian Caba, Chen Zhao, Silvio Giancola, and Bernard Ghanem. 2022. Mad: A scalable dataset for language grounding in videos from movie audio descriptions. In CVPR."},{"key":"e_1_3_2_1_67_1","volume-title":"Synopses of Movie Narratives: a Video-Language Dataset for Story Understanding. arXiv preprint arXiv:2203.05711","author":"Sun Yidan","year":"2022","unstructured":"Yidan Sun, Qin Chao, and Boyang Li. 2022. Synopses of Movie Narratives: a Video-Language Dataset for Story Understanding. arXiv preprint arXiv:2203.05711 (2022)."},{"key":"e_1_3_2_1_68_1","unstructured":"Chaolei Tan Jian-Fang Hu and Wei-Shi Zheng. 2022. Context Alignment Network for Video Moment Retrieval. In CICAI."},{"key":"e_1_3_2_1_69_1","unstructured":"Chaolei Tan Jianhuang Lai Wei-Shi Zheng and Jian-Fang Hu. 2024. Siamese Learning with Joint Alignment and Regression for Weakly-Supervised Video Paragraph Grounding. In CVPR."},{"key":"e_1_3_2_1_70_1","volume-title":"Augmented 2d-tan: A two-stage approach for human-centric spatio-temporal video grounding. arXiv preprint arXiv:2106.10634","author":"Tan Chaolei","year":"2021","unstructured":"Chaolei Tan, Zihang Lin, Jian-Fang Hu, Xiang Li, and Wei-Shi Zheng. 2021. Augmented 2d-tan: A two-stage approach for human-centric spatio-temporal video grounding. arXiv preprint arXiv:2106.10634 (2021)."},{"key":"e_1_3_2_1_71_1","unstructured":"Chaolei Tan Zihang Lin Jian-Fang Hu Wei-Shi Zheng and Jianhuang Lai. 2023. Hierarchical Semantic Correspondence Networks for Video Paragraph Grounding. In CVPR."},{"key":"e_1_3_2_1_72_1","volume-title":"Movieqa: Understanding stories in movies through question-answering. In CVPR.","author":"Tapaswi Makarand","year":"2016","unstructured":"Makarand Tapaswi, Yukun Zhu, Rainer Stiefelhagen, Antonio Torralba, Raquel Urtasun, and Sanja Fidler. 2016. Movieqa: Understanding stories in movies through question-answering. In CVPR."},{"key":"e_1_3_2_1_73_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","unstructured":"Hao Wang Zheng-Jun Zha Xuejin Chen Zhiwei Xiong and Jiebo Luo. 2020. Dual path interaction network for video moment localization. In ACM MM.","DOI":"10.1145\/3394171.3413975"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"crossref","unstructured":"Zheng Wang Jingjing Chen and Yu-Gang Jiang. 2021. Visual co-occurrence alignment learning for weakly-supervised video moment retrieval. In ACM MM.","DOI":"10.1145\/3474085.3475278"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"crossref","unstructured":"Zhenzhi Wang Limin Wang Tao Wu Tianhao Li and Gangshan Wu. 2022. Negative sample matters: A renaissance of metric learning for temporal grounding. In AAAI.","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"e_1_3_2_1_77_1","unstructured":"Wenhao Wu Haipeng Luo Bo Fang Jingdong Wang and Wanli Ouyang. 2023. Cap4Video: What Can Auxiliary Captions Do for Text-Video Retrieval?. In CVPR."},{"key":"e_1_3_2_1_78_1","volume-title":"Next-qa: Next phase of question-answering to explaining temporal actions. In CVPR.","author":"Xiao Junbin","year":"2021","unstructured":"Junbin Xiao, Xindi Shang, Angela Yao, and Tat-Seng Chua. 2021. Next-qa: Next phase of question-answering to explaining temporal actions. In CVPR."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"crossref","unstructured":"Shaoning Xiao Long Chen Songyang Zhang Wei Ji Jian Shao Lu Ye and Jun Xiao. 2021. Boundary proposal network for two-stage natural language video localization. In AAAI.","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"crossref","unstructured":"Yu Xiong Qingqiu Huang Lingfeng Guo Hang Zhou Bolei Zhou and Dahua Lin. 2019. A graph-based framework to bridge movies and synopses. In ICCV.","DOI":"10.1109\/ICCV.2019.00469"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"crossref","unstructured":"Zhengyuan Yang Tianlang Chen Liwei Wang and Jiebo Luo. 2020. Improving one-stage visual grounding by recursive sub-query construction. In ECCV.","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"crossref","unstructured":"Jiong Yin Liang Li Jiehua Zhang Chenggang Yan Lei Zhang and Zunjie Zhu. 2023. Reducing Intrinsic and Extrinsic Data Biases for Moment Localization with Natural Language. In ACM MM.","DOI":"10.1145\/3581783.3612357"},{"key":"e_1_3_2_1_83_1","volume-title":"Activitynet-qa: A dataset for understanding complex web videos via question answering. In AAAI.","author":"Yu Zhou","year":"2019","unstructured":"Zhou Yu, Dejing Xu, Jun Yu, Ting Yu, Zhou Zhao, Yueting Zhuang, and Dacheng Tao. 2019. Activitynet-qa: A dataset for understanding complex web videos via question answering. In AAAI."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"crossref","unstructured":"Yitian Yuan Tao Mei and Wenwu Zhu. 2019. To find where you talk: Temporal sentence localization in video with attention based location regression. In AAAI.","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"crossref","unstructured":"Runhao Zeng Haoming Xu Wenbing Huang Peihao Chen Mingkui Tan and Chuang Gan. 2020. Dense regression network for video grounding. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"e_1_3_2_1_86_1","volume-title":"Man: Moment alignment network for natural language moment retrieval via iterative graph adjustment. In CVPR.","author":"Zhang Da","year":"2019","unstructured":"Da Zhang, Xiyang Dai, Xin Wang, Yuan-Fang Wang, and Larry S Davis. 2019. Man: Moment alignment network for natural language moment retrieval via iterative graph adjustment. In CVPR."},{"key":"e_1_3_2_1_87_1","volume-title":"Joey Tianyi Zhou, and Rick Siow Mong Goh","author":"Zhang Hao","year":"2021","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, Liangli Zhen, Joey Tianyi Zhou, and Rick Siow Mong Goh. 2021. Natural language video localization: A revisit in span-based question answering framework. TPAMI (2021)."},{"key":"e_1_3_2_1_88_1","volume-title":"Temporal sentence grounding in videos: A survey and future directions. TPAMI","author":"Zhang Hao","year":"2023","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, and Joey Tianyi Zhou. 2023. Temporal sentence grounding in videos: A survey and future directions. TPAMI (2023)."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"crossref","unstructured":"Songyang Zhang Houwen Peng Jianlong Fu and Jiebo Luo. 2020. Learning 2d temporal adjacent networks for moment localization with natural language. In AAAI.","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"crossref","unstructured":"Songyang Zhang Jinsong Su and Jiebo Luo. 2019. Exploiting temporal relationships in video moment localization with natural language. In ACM MM.","DOI":"10.1145\/3343031.3350879"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681042","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681042","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:37Z","timestamp":1750295857000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681042"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":90,"alternative-id":["10.1145\/3664647.3681042","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681042","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}