{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:06:39Z","timestamp":1761897999285,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681514","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"8199-8208","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Let Me Finish My Sentence: Video Temporal Grounding with Holistic Text Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4941-6837","authenticated-orcid":false,"given":"Jongbhin","family":"Woo","sequence":"first","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5638-1832","authenticated-orcid":false,"given":"Hyeonggon","family":"Ryu","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0500-6025","authenticated-orcid":false,"given":"Youngjoon","family":"Jang","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9979-8929","authenticated-orcid":false,"given":"Jae Won","family":"Cho","sequence":"additional","affiliation":[{"name":"Sejong University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7741-7275","authenticated-orcid":false,"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[{"name":"KAIST, Daejeon, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_3"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1015"},{"key":"e_1_3_2_2_4_1","unstructured":"Yi-Wen Chen Yi-Hsuan Tsai and Ming-Hsuan Yang. 2021. End-to-end multi-modal video temporal grounding. In NeurIPS."},{"key":"e_1_3_2_2_5_1","volume-title":"Youngtaek Oh, Dong-Jin Kim, and In So Kweon.","author":"Cho Jae Won","year":"2023","unstructured":"Jae Won Cho, Dawit Mureja Argaw, Youngtaek Oh, Dong-Jin Kim, and In So Kweon. 2023. Empirical study on using adapters for debiased Visual Question Answering. Computer Vision and Image Understanding (2023)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01124"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_2_9_1","volume-title":"Proc. ACL.","author":"Ghosh Soham","year":"2019","unstructured":"Soham Ghosh, Anuva Agarwal, Zarana Parekh, and Alexander Hauptmann. 2019. ExCL: Extractive Clip Localization Using Natural Language Descriptions. In Proc. ACL."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475241"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19972"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01273"},{"key":"e_1_3_2_2_13_1","volume-title":"Proc. BMVC.","author":"Jang Youngjoon","year":"2022","unstructured":"Youngjoon Jang, Youngtaek Oh, Jae Won Cho, Dong-Jin Kim, Joon Son Chung, and In So Kweon. 2022. Signing Outside the Studio: Benchmarking Background Robustness for Continuous Sign Language Recognition. In Proc. BMVC."},{"volume-title":"So Kweon, and Joon Son Chung. 2023. Self-Sufficient Framework for Continuous Sign Language Recognition. In Proc. ICASSP.","author":"Jang Youngjoon","key":"e_1_3_2_2_14_1","unstructured":"Youngjoon Jang, Youngtaek Oh, Jae Won Cho, Myungchul Kim, Dong-Jin Kim, In So Kweon, and Joon Son Chung. 2023. Self-Sufficient Framework for Continuous Sign Language Recognition. In Proc. ICASSP."},{"key":"e_1_3_2_2_15_1","volume-title":"Proc. ICLR.","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In Proc. ICLR."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01425"},{"key":"e_1_3_2_2_17_1","volume-title":"Proc. BMVC.","author":"Kim Dong-Jin","year":"2021","unstructured":"Dong-Jin Kim, Jae Won Cho, Jinsoo Choi, Yunjae Jung, and In So Kweon. 2021. Single-Modal Entropy based Active Learning for Visual Question Answering. In Proc. BMVC."},{"key":"e_1_3_2_2_18_1","unstructured":"Jie Lei Tamara L Berg and Mohit Bansal. 2021. Detecting moments and highlights in videos via natural language queries. In NeurIPS."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"e_1_3_2_2_20_1","unstructured":"Junnan Li Ramprasaath Selvaraju Akhilesh Gotmare Shafiq Joty Caiming Xiong and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. In NeurIPS."},{"key":"e_1_3_2_2_21_1","volume-title":"Momentdiff: Generative video moment retrieval from random to real. In NeurIPS.","author":"Li Pandeng","year":"2023","unstructured":"Pandeng Li, Chen-Wei Xie, Hongtao Xie, Liming Zhao, Lei Zhang, Yun Zheng, Deli Zhao, and Yongdong Zhang. 2023. Momentdiff: Generative video moment retrieval from random to real. In NeurIPS."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531382"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240549"},{"key":"e_1_3_2_2_25_1","volume-title":"Proc. ICLR.","author":"Liu Shilong","year":"2022","unstructured":"Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, and Lei Zhang. 2022. Dab-detr: Dynamic anchor boxes are better queries for detr. In Proc. ICLR."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298994"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00279"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_2_32_1","volume-title":"Proc. ICLR.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proc. ICLR."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00207"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00112"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/354384.354443"},{"volume-title":"So Kweon, and Joon Son Chung. 2023. Hindi as a second language: Improving visually grounded speech with semantically similar samples. In Proc. ICASSP.","author":"Ryu Hyeonggon","key":"e_1_3_2_2_37_1","unstructured":"Hyeonggon Ryu, Arda Senocak, In So Kweon, and Joon Son Chung. 2023. Hindi as a second language: Improving visually grounded speech with semantically similar samples. In Proc. ICASSP."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00715"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_13"},{"key":"e_1_3_2_2_40_1","volume-title":"Proc. ICLR.","author":"Simonyan Karen","year":"2015","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very deep convolutional networks for large-scale image recognition. In Proc. ICLR."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00361"},{"key":"e_1_3_2_2_42_1","volume-title":"Proc. CVPR.","author":"Song Yale","year":"2015","unstructured":"Yale Song, Jordi Vallmitjana, Amanda Stent, and Alejandro Jaimes. 2015. Tvsum: Summarizing web videos using titles. In Proc. CVPR."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28304"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_51"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_51"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_2_48_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. [n. d.]. Attention is all you need. In NeurIPS."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447235"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01770"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019062"},{"key":"e_1_3_2_2_54_1","volume-title":"Proc. ACM MM.","author":"Xu Yifang","year":"2023","unstructured":"Yifang Xu, Yunzhuo Sun, Yang Li, Yilei Shi, Xiaoxiang Zhu, and Sidan Du. 2023. Mh-detr: Video moment and highlight detection with cross-modal transformer. In Proc. ACM MM."},{"key":"e_1_3_2_2_55_1","volume-title":"NeurIPS","volume":"32","author":"Yuan Yitian","year":"2019","unstructured":"Yitian Yuan, Lin Ma, Jingwen Wang, Wei Liu, and Wenwu Zhu. 2019. Semantic conditioned dynamic modulation for temporal sentence grounding in videos. In NeurIPS, Vol. 32."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350985"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00134"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350879"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25478"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681514","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681514","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681514"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":62,"alternative-id":["10.1145\/3664647.3681514","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681514","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}