{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T17:37:46Z","timestamp":1779385066157,"version":"3.53.1"},"reference-count":96,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3419446","type":"journal-article","created":{"date-parts":[[2024,6,26]],"date-time":"2024-06-26T18:23:40Z","timestamp":1719426220000},"page":"3339-3354","source":"Crossref","is-referenced-by-count":130,"title":["WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6079-5130","authenticated-orcid":false,"given":"Xinhao","family":"Mei","sequence":"first","affiliation":[{"name":"Centre for Vision, Speech, and Signal Processing, University of Surrey, Guildford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7591-4793","authenticated-orcid":false,"given":"Chutong","family":"Meng","sequence":"additional","affiliation":[{"name":"Johns Hopkins University, Baltimore, MD, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1036-7888","authenticated-orcid":false,"given":"Haohe","family":"Liu","sequence":"additional","affiliation":[{"name":"Centre for Vision, Speech, and Signal Processing, University of Surrey, Guildford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qiuqiang","family":"Kong","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tom","family":"Ko","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chengqi","family":"Zhao","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9708-1075","authenticated-orcid":false,"given":"Mark D.","family":"Plumbley","sequence":"additional","affiliation":[{"name":"Centre for Vision, Speech, and Signal Processing, University of Surrey, Guildford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9999-6140","authenticated-orcid":false,"given":"Yuexian","family":"Zou","sequence":"additional","affiliation":[{"name":"School of Electronic and Computer Engineering, Peking University Shenzhen Graduate School, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8393-5703","authenticated-orcid":false,"given":"Wenwu","family":"Wang","sequence":"additional","affiliation":[{"name":"Centre for Vision, Speech, and Signal Processing, University of Surrey, Guildford, U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2930913"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3090678"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2690563"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2014.2326181"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747336"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3149712"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11115"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-022-00259-2"},{"key":"ref12","first-page":"170","article-title":"Automated audio captioning by fine-tuning BART with AudioSet tags","volume-title":"Proc. 6th Detection Classification Acoustic Scenes Events","author":"Gontier","year":"2021"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2087"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO55093.2022.9909680"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3010650"},{"key":"ref16","article-title":"AudioGen: Textually guided audio generation","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Kreuk","year":"2023"},{"key":"ref17","first-page":"21450","article-title":"AudioLDM: Text-to-audio generation with latent diffusion models","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Liu","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3268730"},{"key":"ref19","article-title":"MusicLM: Generating music from text","author":"Agostinelli","year":"2023"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10894"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.07.009"},{"key":"ref22","article-title":"ImageBERT: Cross-modal pre-training with large-scale weak-supervised image-text data","author":"Qi","year":"2020"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref25","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1038\/323533a0"},{"key":"ref32","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li","year":"2021"},{"key":"ref33","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2022"},{"key":"ref34","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia","year":"2021"},{"key":"ref35","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"ref37","first-page":"119","article-title":"AudioCaps: Generating captions for audios in the wild","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics: Hum. Lang. Technol.","author":"Kim","year":"2019"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref39","article-title":"Microsoft COCO captions: Data collection and evaluation server","author":"Chen","year":"2015"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwx106"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2922396"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref45","first-page":"90","article-title":"Diversity and bias in audio captioning datasets","volume-title":"Proc. 6th Detection Classification Acoustic Scenes Events","author":"Martin","year":"2021"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref48","first-page":"211","article-title":"Audio captioning transformer","volume-title":"Proc. 6th Detection Classification Acoustic Scenes Events","author":"Mei","year":"2021"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO55093.2022.9909761"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.33682\/sezz-vd31"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682377"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-1136"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"issue":"140","key":"ref54","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414579"},{"key":"ref57","article-title":"Listen, think, and understand","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Gong","year":"2024"},{"key":"ref58","doi-asserted-by":"crossref","DOI":"10.1145\/3664647.3681472","article-title":"A large-scale dataset for audio-language representation learning","author":"Sun","year":"2024"},{"key":"ref59","first-page":"409","article-title":"LP-MusicCaps: LLM-based pseudo music captioning","volume-title":"Proc. 24th Int. Soc. Music Inf. Retrieval Conf.","author":"Doh","year":"2023"},{"key":"ref60","article-title":"UniAudio: An audio foundation model toward universal audio generation","author":"Yang","year":"2023"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612348"},{"key":"ref63","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Tang","year":"2024"},{"key":"ref64","article-title":"ChatBridge: Bridging modalities with large language model as a language catalyst","author":"Zhao","year":"2023"},{"key":"ref65","article-title":"Acoustic prompt tuning: Empowering large language models with audition capabilities","author":"Liang","year":"2023"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2502245"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21236\/ADA006655"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_24"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096972"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref74","article-title":"ADAM: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096877"},{"key":"ref76","first-page":"40","article-title":"Improving the performance of automated audio captioning via integrating the acoustic and semantic information","volume-title":"Proc. 6th Detection Classification Acoustic Scenes Events","author":"Ye","year":"2021"},{"key":"ref77","first-page":"6","article-title":"Automated audio captioning with weakly supervised pre-training and word selection methods","volume-title":"Proc. 6th Detection Classification Acoustic Scenes Events","author":"Han","year":"2021"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPAASC55919.2022.9980325"},{"key":"ref79","first-page":"2773","article-title":"Interactive audio-text representation for automated audio captioning with contrastive learning","volume-title":"Proc. Annu. Conf. Int. Speech Commun. Assoc.","author":"Chen","year":"2022"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-914"},{"key":"ref81","article-title":"Improving audio-language learning with MixGen and multi-level test-time augmentation","author":"Kim","year":"2022"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref84","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Proc. Text Summarization Branches Out","author":"Lin","year":"2004"},{"key":"ref85","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. ACL Workshop Intrinsic Extrinsic Eval. Measures Mach. Transl. Summarization","author":"Banerjee","year":"2005"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref91","first-page":"5178","article-title":"BEATs: Audio pre-training with acoustic tokenizers","volume-title":"Proc. 40th Int. Conf. Mach. Learn. Res.","author":"Chen","year":"2023"},{"key":"ref92","article-title":"End-to-end audio strikes back: Boosting augmentations towards an efficient audio classification network","author":"Gazneli","year":"2022"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613820"},{"key":"ref96","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ho","year":"2020"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6570655\/10304349\/10572302.pdf?arnumber=10572302","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T20:29:06Z","timestamp":1732307346000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10572302\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":96,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3419446","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}