{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,16]],"date-time":"2026-07-16T05:18:47Z","timestamp":1784179127610,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100017582","name":"Beijing National Research Center For Information Science And Technology","doi-asserted-by":"publisher","award":["BNR2023RC01003,BNR2023TD03006"],"award-info":[{"award-number":["BNR2023RC01003,BNR2023TD03006"]}],"id":[{"id":"10.13039\/501100017582","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62250008,62222209,62102222"],"award-info":[{"award-number":["62250008,62222209,62102222"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020AAA0106300"],"award-info":[{"award-number":["2020AAA0106300"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612504","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"3117-3128","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Curriculum-Listener: Consistency- and Complementarity-Aware Audio-Enhanced Temporal Sentence Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7749-7328","authenticated-orcid":false,"given":"Houlun","family":"Chen","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0351-2939","authenticated-orcid":false,"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5382-6699","authenticated-orcid":false,"given":"Xiaohan","family":"Lan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0943-2286","authenticated-orcid":false,"given":"Hong","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9108-9618","authenticated-orcid":false,"given":"Xuguang","family":"Duan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8449-278X","authenticated-orcid":false,"given":"Jia","family":"Jia","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2236-9290","authenticated-orcid":false,"given":"Wenwu","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.5220\/0010832700003124"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.773"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_6_1","first-page":"26924","article-title":"Curriculum disentangled recommendation with noisy multi-feedback","volume":"34","author":"Chen Hong","year":"2021","unstructured":"Hong Chen, Yudong Chen, Xin Wang, Ruobing Xie, Rui Wang, Feng Xia, and Wenwu Zhu. 2021a. Curriculum disentangled recommendation with noisy multi-feedback. Advances in Neural Information Processing Systems, Vol. 34 (2021), 26924--26936.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6627"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_20"},{"key":"e_1_3_2_2_9_1","first-page":"28442","article-title":"End-to-end multi-modal video temporal grounding","volume":"34","author":"Chen Yi-Wen","year":"2021","unstructured":"Yi-Wen Chen, Yi-Hsuan Tsai, and Ming-Hsuan Yang. 2021b. End-to-end multi-modal video temporal grounding. Advances in Neural Information Processing Systems, Vol. 34 (2021), 28442--28453.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3222965"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_2_13_1","volume-title":"international conference on machine learning. Pmlr, 1311--1320","author":"Graves Alex","year":"2017","unstructured":"Alex Graves, Marc G Bellemare, Jacob Menick, Remi Munos, and Koray Kavukcuoglu. 2017. Automated curriculum learning for neural networks. In international conference on machine learning. Pmlr, 1311--1320."},{"key":"e_1_3_2_2_14_1","volume-title":"International conference on machine learning. PMLR, 2535--2544","author":"Hacohen Guy","year":"2019","unstructured":"Guy Hacohen and Daphna Weinshall. 2019. On the power of curriculum learning in training deep networks. In International conference on machine learning. PMLR, 2535--2544."},{"key":"e_1_3_2_2_15_1","volume-title":"Jort F Gemmeke, Aren Jansen, R Channing Moore, Manoj Plakal, Devin Platt, Rif A Saurous, Bryan Seybold, et al.","author":"Hershey Shawn","year":"2017","unstructured":"Shawn Hershey, Sourish Chaudhuri, Daniel PW Ellis, Jort F Gemmeke, Aren Jansen, R Channing Moore, Manoj Plakal, Devin Platt, Rif A Saurous, Bryan Seybold, et al. 2017. CNN architectures for large-scale audio classification. In 2017 ieee international conference on acoustics, speech and signal processing (icassp). IEEE, 131--135."},{"key":"e_1_3_2_2_16_1","volume-title":"International Conference on Machine Learning. PMLR, 9938--9964","author":"Javaloy Adri\u00e1n","year":"2022","unstructured":"Adri\u00e1n Javaloy, Maryam Meghdadi, and Isabel Valera. 2022. Mitigating modality collapse in multimodal VAEs via impartial optimization. In International Conference on Machine Learning. PMLR, 9938--9964."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548309"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_2_20_1","volume-title":"Self-paced learning for latent variable models. Advances in neural information processing systems","author":"Kumar M","year":"2010","unstructured":"M Kumar, Benjamin Packer, and Daphne Koller. 2010. Self-paced learning for latent variable models. Advances in neural information processing systems, Vol. 23 (2010)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3532626"},{"key":"e_1_3_2_2_22_1","first-page":"11846","article-title":"Detecting moments and highlights in videos via natural language queries","volume":"34","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Tamara L Berg, and Mohit Bansal. 2021. Detecting moments and highlights in videos via natural language queries. Advances in Neural Information Processing Systems, Vol. 34 (2021), 11846--11858.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16285"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.596"},{"key":"e_1_3_2_2_25_1","volume-title":"5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=BJC_jUqxe","author":"Lin Zhouhan","year":"2017","unstructured":"Zhouhan Lin, Minwei Feng, C\u00edcero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017. A Structured Self-Attentive Sentence Embedding. In 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=BJC_jUqxe"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3238514"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.732"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414026"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20059"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210003"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"e_1_3_2_2_32_1","volume-title":"Fixing Weight Decay Regularization in Adam. CoRR","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Fixing Weight Decay Regularization in Adam. CoRR, Vol. abs\/1711.05101 (2017). showeprint[arXiv]1711.05101 http:\/\/arxiv.org\/abs\/1711.05101"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1518"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531961"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2934906"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2017.05.043"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20044-1_28"},{"key":"e_1_3_2_2_38_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10553--10563","author":"Mercea Otniel-Bogdan","year":"2022","unstructured":"Otniel-Bogdan Mercea, Lukas Riesch, A. Sophia Koepke, and Zeynep Akata. 2022b. Audio-Visual Generalised Zero-Shot Learning With Cross-Modal Attention and Language. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10553--10563."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00138"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00748"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-1119"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583374"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093328"},{"key":"e_1_3_2_2_47_1","volume-title":"5th International Conference on Learning Representations, ICLR","author":"Seo Min Joon","year":"2017","unstructured":"Min Joon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi. 2017. Bidirectional Attention Flow for Machine Comprehension. In 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=HJ0UKP9ge"},{"key":"e_1_3_2_2_48_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00361"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00695"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"e_1_3_2_2_53_1","first-page":"4555","article-title":"A survey on curriculum learning","volume":"44","author":"Wang Xin","year":"2021","unstructured":"Xin Wang, Yudong Chen, and Wenwu Zhu. 2021a. A survey on curriculum learning. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 9 (2021), 4555--4576.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_2_54_1","volume-title":"Explore-And-Match: Bridging Proposal-Based and Proposal-Free With Transformer for Sentence Grounding in Videos. arXiv preprint arXiv:2201.10168","author":"Woo Sangmin","year":"2022","unstructured":"Sangmin Woo, Jinyoung Park, Inyong Koo, Sumin Lee, Minki Jeong, and Changick Kim. 2022. Explore-And-Match: Bridging Proposal-Based and Proposal-Free With Transformer for Sentence Grounding in Videos. arXiv preprint arXiv:2201.10168 (2022)."},{"key":"e_1_3_2_2_55_1","volume-title":"International Conference on Machine Learning. PMLR, 24043--24055","author":"Wu Nan","year":"2022","unstructured":"Nan Wu, Stanislaw Jastrzebski, Kyunghyun Cho, and Krzysztof J Geras. 2022. Characterizing and overcoming the greedy nature of learning in multi-modal deep neural networks. In International Conference on Machine Learning. PMLR, 24043--24055."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"e_1_3_2_2_57_1","volume-title":"5th International Conference on Learning Representations, ICLR","author":"Xiong Caiming","year":"2017","unstructured":"Caiming Xiong, Victor Zhong, and Richard Socher. 2017. Dynamic Coattention Networks For Question Answering. In 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=rJeKjwvclx"},{"key":"e_1_3_2_2_58_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Yuan Yitian","year":"2019","unstructured":"Yitian Yuan, Lin Ma, Jingwen Wang, Wei Liu, and Wenwu Zhu. 2019a. Semantic conditioned dynamic modulation for temporal sentence grounding in videos. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00134"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01248"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_2_65_1","volume-title":"An empirical exploration of curriculum learning for neural machine translation. arXiv preprint arXiv:1811.00739","author":"Zhang Xuan","year":"2018","unstructured":"Xuan Zhang, Gaurav Kumar, Huda Khayrallah, Kenton Murray, Jeremy Gwinnup, Marianna J Martindale, Paul McNamee, Kevin Duh, and Marine Carpuat. 2018. An empirical exploration of curriculum learning for neural machine translation. arXiv preprint arXiv:1811.00739 (2018)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612504","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612504","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:02:27Z","timestamp":1755820947000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612504"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":65,"alternative-id":["10.1145\/3581783.3612504","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612504","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}