{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T16:09:56Z","timestamp":1780675796216,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Nature Science Foundation of China","award":["62322211 U21B2024 61931008 62071415"],"award-info":[{"award-number":["62322211 U21B2024 61931008 62071415"]}]},{"name":"the National Key Research and Development Program of China under Grant","award":["2020YFB1406604"],"award-info":[{"award-number":["2020YFB1406604"]}]},{"name":"\"Pioneer\" Zhejiang Provincial Natural Science Foundation of China","award":["LDT23F01011F01 LDT23F01015F01 LDT23F01014F01"],"award-info":[{"award-number":["LDT23F01011F01 LDT23F01015F01 LDT23F01014F01"]}]},{"name":"\"Leading Goose\" R&D Program of Zhejiang Province","award":["2022C01068"],"award-info":[{"award-number":["2022C01068"]}]},{"name":"Youth Innovation Promotion Association of Chinese Academy of Sciences under Grant","award":["2020108"],"award-info":[{"award-number":["2020108"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612357","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"4584-4594","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Reducing Intrinsic and Extrinsic Data Biases for Moment Localization with Natural Language"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1991-4908","authenticated-orcid":false,"given":"Jiong","family":"Yin","sequence":"first","affiliation":[{"name":"HangZhou Dianzi University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1943-8219","authenticated-orcid":false,"given":"Liang","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0790-9279","authenticated-orcid":false,"given":"Jiehua","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1204-0512","authenticated-orcid":false,"given":"Chenggang","family":"Yan","sequence":"additional","affiliation":[{"name":"Hangzhou Dianzi University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2839-8693","authenticated-orcid":false,"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Lishui Institute of Hangzhou Dianzi University, Lishui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6107-4538","authenticated-orcid":false,"given":"Zunjie","family":"Zhu","sequence":"additional","affiliation":[{"name":"Hangzhou Dianzi University &amp; Lishui Institute of Hangzhou Dianzi University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_3_1","volume-title":"Natural language processing with Python: analyzing text with the natural language toolkit. \"O'Reilly Media","author":"Bird Steven","unstructured":"Steven Bird, Ewan Klein, and Edward Loper. 2009. Natural language processing with Python: analyzing text with the natural language toolkit. \"O'Reilly Media, Inc.\"."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1015"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6627"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018199"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3187288"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_1_9_1","volume-title":"Mac: Mining activity concepts for language-based temporal localization. In 2019 IEEE winter conference on applications of computer vision (WACV)","author":"Ge Runzhou","year":"2019","unstructured":"Runzhou Ge, Jiyang Gao, Kan Chen, and Ram Nevatia. 2019. Mac: Mining activity concepts for language-based temporal localization. In 2019 IEEE winter conference on applications of computer vision (WACV). IEEE, 245--253."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"e_1_3_2_1_11_1","volume-title":"Excl: Extractive clip localization using natural language descriptions. arXiv preprint arXiv:1904.02755","author":"Ghosh Soham","year":"2019","unstructured":"Soham Ghosh, Anuva Agarwal, Zarana Parekh, and Alexander Hauptmann. 2019. Excl: Extractive clip localization using natural language descriptions. arXiv preprint arXiv:1904.02755 (2019)."},{"key":"e_1_3_2_1_12_1","volume-title":"Can Shuffling Video Benefit Temporal Bias Problem: A Novel Training Framework for Temporal Grounding. In European Conference on Computer Vision. Springer, 130--147","author":"Hao Jiachang","year":"2022","unstructured":"Jiachang Hao, Haifeng Sun, Pengfei Ren, Jingyu Wang, Qi Qi, and Jianxin Liao. 2022. Can Shuffling Video Benefit Temporal Bias Problem: A Novel Training Framework for Temporal Grounding. In European Conference on Computer Vision. Springer, 130--147."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_14_1","volume-title":"Localizing moments in video with temporal language. arXiv preprint arXiv:1809.01337","author":"Hendricks Lisa Anne","year":"2018","unstructured":"Lisa Anne Hendricks, Oliver Wang, Eli Shechtman, Josef Sivic, Trevor Darrell, and Bryan Russell. 2018. Localizing moments in video with temporal language. arXiv preprint arXiv:1809.01337 (2018)."},{"key":"e_1_3_2_1_15_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475281"},{"key":"e_1_3_2_1_17_1","volume-title":"Video Activity Localisation with Uncertainties in Temporal Boundary. In European Conference on Computer Vision. Springer, 724--740","author":"Huang Jiabo","year":"2022","unstructured":"Jiabo Huang, Hailin Jin, Shaogang Gong, and Yang Liu. 2022. Video Activity Localisation with Uncertainties in Temporal Boundary. In European Conference on Computer Vision. Springer, 724--740."},{"key":"e_1_3_2_1_18_1","volume-title":"Overcoming Weak Visual-Textual Alignment for Video Moment Retrieval. arXiv preprint arXiv:2306.02728","author":"Jung Minjoon","year":"2023","unstructured":"Minjoon Jung, Youwon Jang, Seongho Choi, Joochan Kim, Jin-Hwa Kim, and Byoung-Tak Zhang. 2023. Overcoming Weak Visual-Textual Alignment for Video Moment Retrieval. arXiv preprint arXiv:2306.02728 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_20_1","volume-title":"On information and sufficiency. The annals of mathematical statistics","author":"Kullback Solomon","year":"1951","unstructured":"Solomon Kullback and Richard A Leibler. 1951. On information and sufficiency. The annals of mathematical statistics, Vol. 22, 1 (1951), 79--86."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25204"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3158546"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547969"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240549"},{"key":"e_1_3_2_1_25_1","first-page":"3003","article-title":"Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding","volume":"45","author":"Liu Xuejing","year":"2022","unstructured":"Xuejing Liu, Liang Li, Shuhui Wang, Zheng-Jun Zha, Zechao Li, Qi Tian, and Qingming Huang. 2022a. Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 3 (2022), 3003--3018.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00279"},{"key":"e_1_3_2_1_28_1","volume-title":"Uncovering hidden challenges in query-based video moment retrieval. arXiv preprint arXiv:2009.00325","author":"Otani Mayu","year":"2020","unstructured":"Mayu Otani, Yuta Nakashima, Esa Rahtu, and Janne Heikkil\u00e4. 2020. Uncovering hidden challenges in query-based video moment retrieval. arXiv preprint arXiv:2009.00325 (2020)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33718-5_11"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00361"},{"key":"e_1_3_2_1_33_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413975"},{"key":"e_1_3_2_1_35_1","volume-title":"Semantic and relation modulation for audio-visual event localization","author":"Wang Hao","year":"2022","unstructured":"Hao Wang, Zheng-Jun Zha, Liang Li, Xuejin Chen, and Jiebo Luo. 2022b. Semantic and relation modulation for audio-visual event localization. IEEE Transactions on Pattern Analysis and Machine Intelligence (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3290083"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00695"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00042"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019062"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2975798"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3067449"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404374"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472810"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3468872"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00265"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475723.3484247"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475723.3484247"},{"key":"e_1_3_2_1_52_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Yuan Yitian","year":"2019","unstructured":"Yitian Yuan, Lin Ma, Jingwen Wang, Wei Liu, and Wenwu Zhu. 2019a. Semantic conditioned dynamic modulation for temporal sentence grounding in videos. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00225"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547963"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00134"},{"key":"e_1_3_2_1_59_1","volume-title":"Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931","author":"Zhang Hao","year":"2020","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, and Joey Tianyi Zhou. 2020b. Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931 (2020)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01248"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3120745"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_1_63_1","volume-title":"Text-Visual Prompting for Efficient 2D Temporal Video Grounding. arXiv preprint arXiv:2303.04995","author":"Zhang Yimeng","year":"2023","unstructured":"Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, and Ke Ding. 2023. Text-Visual Prompting for Efficient 2D Temporal Video Grounding. arXiv preprint arXiv:2303.04995 (2023)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3023339"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00418"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612357","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612357","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:27Z","timestamp":1755820827000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612357"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":65,"alternative-id":["10.1145\/3581783.3612357","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612357","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}