{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,2]],"date-time":"2026-05-02T14:52:46Z","timestamp":1777733566871,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["No.2022ZD0161400"],"award-info":[{"award-number":["No.2022ZD0161400"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681472","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"5025-5034","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":26,"title":["Auto-ACD: A Large-scale Dataset for Audio-Language Representation Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2408-9624","authenticated-orcid":false,"given":"Luoyi","family":"Sun","sequence":"first","affiliation":[{"name":"CMIC, Shanghai Jiao Tong University &amp; Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8718-1278","authenticated-orcid":false,"given":"Xuenan","family":"Xu","sequence":"additional","affiliation":[{"name":"X-LANCE, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5599-8707","authenticated-orcid":false,"given":"Mengyue","family":"Wu","sequence":"additional","affiliation":[{"name":"X-LANCE, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8609-6826","authenticated-orcid":false,"given":"Weidi","family":"Xie","sequence":"additional","affiliation":[{"name":"CMIC, Shanghai Jiao Tong University &amp; Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"25","article-title":"Self-supervised multimodal versatile networks","volume":"33","author":"Alayrac Jean-Baptiste","year":"2020","unstructured":"Jean-Baptiste Alayrac, Adria Recasens, Rosalia Schneider, Relja Arandjelovi\u0107, Jason Ramapuram, Jeffrey De Fauw, Lucas Smaira, Sander Dieleman, and Andrew Zisserman. 2020. Self-supervised multimodal versatile networks. Advances in Neural Information Processing Systems, Vol. 33 (2020), 25--37.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1080\/08839514.2018.1430469"},{"key":"e_1_3_2_2_4_1","volume-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. 65--72."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_9_1","volume-title":"Acoustic scene classification: a comprehensive survey. Expert Systems with Applications","author":"Ding Biyun","year":"2023","unstructured":"Biyun Ding, Tao Zhang, Chao Wang, Ganjun Liu, Jinhua Liang, Ruimin Hu, Yulin Wu, and Difei Guo. 2023. Acoustic scene classification: a comprehensive survey. Expert Systems with Applications (2023), 121902."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i11.29104"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_2_13_1","volume-title":"Proceedings of the INTERSPEECH Conference","author":"Di Hu","year":"2023","unstructured":"Di Hu Guangyao li, Yixin Xu. 2023. Multi-scale attention for audio question answering. Proceedings of the INTERSPEECH Conference (2023)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01255"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01815"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","unstructured":"Toni Heittola Annamaria Mesaros and Tuomas Virtanen. 2020. TAU Urban Acoustic Scenes 2020 Mobile Development dataset. https:\/\/doi.org\/10.5281\/zenodo.3670167","DOI":"10.5281\/zenodo.3670167"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01023"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448489"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i7.25967"},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. Audiocaps: Generating captions for audios in the wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 119--132."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3149712"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_2_26_1","volume-title":"Proceedings of the International Conference on Machine Learning. 19730--19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In Proceedings of the International Conference on Machine Learning. 19730--19742."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096853"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611724"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073445.1073465"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO55093.2022.9909680"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00551"},{"key":"e_1_3_2_2_32_1","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chunyuan Li Jianwei Yang Hang Su Jun Zhu et al. 2023. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"e_1_3_2_2_34_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_2_35_1","unstructured":"Irene Martin Morato and Annamaria Mesaros. 2021. Diversity and bias in audio captioning datasets. In Detection and Classication of Acoustic Scenes and Events. 90--94."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-022-00259-2"},{"key":"e_1_3_2_2_37_1","volume-title":"WavCaps: A chatGPT-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395","author":"Mei Xinhao","year":"2023","unstructured":"Xinhao Mei, Chutong Meng, Haohe Liu, Qiuqiang Kong, Tom Ko, Chengqi Zhao, Mark D Plumbley, Yuexian Zou, and Wenwu Wang. 2023. WavCaps: A chatGPT-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395 (2023)."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_2_39_1","first-page":"37524","article-title":"A closer look at weakly-supervised audio-visual source localization","volume":"35","author":"Mo Shentong","year":"2022","unstructured":"Shentong Mo and Pedro Morgado. 2022. A closer look at weakly-supervised audio-visual source localization. Advances in Neural Information Processing Systems, Vol. 35 (2022), 37524--37536.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01018"},{"key":"e_1_3_2_2_41_1","volume-title":"AV-SAM: Segment anything model meets audio-visual localization and segmentation. arXiv preprint arXiv:2305.01836","author":"Mo Shentong","year":"2023","unstructured":"Shentong Mo and Yapeng Tian. 2023. AV-SAM: Segment anything model meets audio-visual localization and segmentation. arXiv preprint arXiv:2305.01836 (2023)."},{"key":"e_1_3_2_2_42_1","volume-title":"Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H Bermano. 2021. Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097115"},{"key":"e_1_3_2_2_44_1","volume-title":"Audio retrieval with natural language queries. arXiv preprint arXiv:2105.02192","author":"Oncescu Andreea-Maria","year":"2021","unstructured":"Andreea-Maria Oncescu, A Koepke, Joao F Henriques, Zeynep Akata, and Samuel Albanie. 2021. Audio retrieval with natural language queries. arXiv preprint arXiv:2105.02192 (2021)."},{"key":"e_1_3_2_2_45_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"e_1_3_2_2_47_1","volume-title":"Rethinking CNN models for audio classification. arXiv preprint arXiv:2007.11154","author":"Palanisamy Kamalesh","year":"2020","unstructured":"Kamalesh Palanisamy, Dipika Singhania, and Angela Yao. 2020. Rethinking CNN models for audio classification. arXiv preprint arXiv:2007.11154 (2020)."},{"key":"e_1_3_2_2_48_1","volume-title":"Proceedings of the International Conference on Machine Learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning. 8748--8763."},{"key":"e_1_3_2_2_49_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_2_50_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_2_51_1","volume-title":"Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084","author":"Reimers Nils","year":"2019","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084 (2019)."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"e_1_3_2_2_54_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 25278--25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00621"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2002.800560"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_2_59_1","volume-title":"A comprehensive survey of automated audio captioning. arXiv preprint arXiv:2205.05357","author":"Xu Xuenan","year":"2022","unstructured":"Xuenan Xu, Mengyue Wu, and Kai Yu. 2022. A comprehensive survey of automated audio captioning. arXiv preprint arXiv:2205.05357 (2022)."},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613820"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.333"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_22"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01886"},{"key":"e_1_3_2_2_65_1","volume-title":"Jesse Dodge, Alex Fang, Youngjae Yu, Ludwig Schmidt, William Yang Wang, and Yejin Choi.","author":"Zhu Wanrong","year":"2024","unstructured":"Wanrong Zhu, Jack Hessel, Anas Awadalla, Samir Yitzhak Gadre, Jesse Dodge, Alex Fang, Youngjae Yu, Ludwig Schmidt, William Yang Wang, and Yejin Choi. 2024. Multimodal c4: An open, billion-scale corpus of images interleaved with text. Advances in Neural Information Processing Systems, Vol. 36 (2024)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681472","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681472","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:47Z","timestamp":1750294667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681472"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":65,"alternative-id":["10.1145\/3664647.3681472","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681472","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}