{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:43:02Z","timestamp":1782834182846,"version":"3.54.5"},"reference-count":113,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61901265"],"award-info":[{"award-number":["61901265"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Municipal Science and Technology Major Project","award":["2021SHZDZX0102"],"award-info":[{"award-number":["2021SHZDZX0102"]}]},{"name":"Key Research and Development Program of Jiangsu Province China","award":["BE2022059-2"],"award-info":[{"award-number":["BE2022059-2"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2023.3321968","type":"journal-article","created":{"date-parts":[[2023,10,13]],"date-time":"2023-10-13T18:10:57Z","timestamp":1697220657000},"page":"95-112","source":"Crossref","is-referenced-by-count":18,"title":["Beyond the Status Quo: A Contemporary Survey of Advances and Challenges in Audio Captioning"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8718-1278","authenticated-orcid":false,"given":"Xuenan","family":"Xu","sequence":"first","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9546-3301","authenticated-orcid":false,"given":"Zeyu","family":"Xie","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5599-8707","authenticated-orcid":false,"given":"Mengyue","family":"Wu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7102-9826","authenticated-orcid":false,"given":"Kai","family":"Yu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096877"},{"key":"ref56","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"Proc Int Conf North Amer Chapter Assoc Computat Linguistics"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"ref58","first-page":"1","article-title":"PEACS: Prefix encoding for auditory caption synthesis","author":"schauml\u00f6ffel","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref53","article-title":"Audio captioning using pre-trained large-scale language model guided by audio-based similar caption retrieval","author":"koizumi","year":"2020"},{"key":"ref52","first-page":"1","article-title":"Cau submission to DCASE 2021 task6: Transformer followed by transfer learning for audio captioning","author":"won","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref55","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"radford","year":"2019","journal-title":"OpenAIRE blog"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO55093.2022.9909761"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref50","first-page":"21","article-title":"Audio captioning based on transformer and pre-trained CNN","author":"chen","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3189536"},{"key":"ref45","first-page":"170","article-title":"Automated audio captioning by fine-tuning bart with audioset tags","author":"gontier","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref48","first-page":"1","article-title":"The curious case of neural text degeneration","author":"holtzman","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref47","first-page":"190","article-title":"Effects of word-frequency based pre- and post- processings for audio captioning","author":"takeuchi","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413982"},{"key":"ref41","first-page":"60","article-title":"Evaluating off-the-shelf machine listening and natural language models for automated audio captioning","author":"weck","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10510"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10073"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1296"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-227"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"ref9","author":"wang","year":"2006","journal-title":"Computational Auditory Scene Analysis Principles Algorithms and Applications"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-2227"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.33682\/sezz-vd31"},{"key":"ref6","first-page":"1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-022-00259-2"},{"key":"ref100","first-page":"228","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","author":"lavie","year":"0","journal-title":"Proc 2nd Workshop Statist Mach Translat"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref40","first-page":"160","article-title":"Leveraging state-of-the-art ASR techniques to audio captioning","author":"narisetty","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref35","first-page":"211","article-title":"Audio captioning transformer","author":"mei","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref34","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"Proc Int Conf Adv Neural Inf Process Syst"},{"key":"ref37","first-page":"225","article-title":"A CRNN-GRU based reinforcement learning approach to audio captioning","author":"xu","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref36","article-title":"Efficient audio captioning transformer with patchout and text guidance","author":"kouzelis","year":"2022"},{"key":"ref31","first-page":"6","article-title":"Automated audio captioning with weakly supervised pre-training and word selection methods","author":"han","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO54536.2021.9616340"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3266114"},{"key":"ref32","first-page":"40","article-title":"Improving the performance of automated audio captioning via integrating the acoustic and semantic information","author":"ye","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref39","first-page":"1","article-title":"Audio captioning with meshed-memory transformer","author":"chen","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747676"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.33682\/7bay-bj41"},{"key":"ref23","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"lin","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref26","first-page":"1","article-title":"The NTT DCASE2020 challenge task 6 system: Automated audio captioning with keywords and sentence length estimation","author":"koizumi","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref25","first-page":"150","article-title":"Listen carefully and tell: An audio captioning system based on residual learning and gammatone audio representation","author":"perez-castanos","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref20","article-title":"An attempt towards interpretable audio-visual video captioning","author":"tian","year":"2018"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-914"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123448"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.2020.00014"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2017.8170058"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15825-4_10"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-42"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095266"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref14","first-page":"61","article-title":"Guided multi-branch learning systems for sound event detection with sound separation","author":"huang","year":"0","journal-title":"Proc DCASE"},{"key":"ref97","article-title":"Evaluation of text generation: A survey","author":"celikyilmaz","year":"2020"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096526"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3290428"},{"key":"ref99","first-page":"25","article-title":"Rouge: A package for automatic evaluation of summaries","author":"chin-yew","year":"0","journal-title":"Proc Workshop Text Summarization Branches Out"},{"key":"ref10","first-page":"3846","article-title":"Unsupervised sound separation using mixture invariant training","author":"wisdom","year":"0","journal-title":"Proc Int Conf Adv Neural Inf Process Syst"},{"key":"ref98","first-page":"311","article-title":"BLEU: A method for automatic evaluation of machine translation","author":"kishore","year":"0","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref17","first-page":"1","article-title":"Beats-based audio captioning model with instructor embedding supervision and ChatGPt mix-up","author":"wu","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"},{"key":"ref93","first-page":"90","article-title":"Diversity and bias in audio captioning datasets","author":"martin","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref95","first-page":"1","article-title":"Is my automatic audio captioning system so bad? Spider-max: A metric to consider several caption candidates","author":"labb\u00e9","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref94","first-page":"9","article-title":"A multi-device dataset for urban acoustic scene classification","author":"mesaros","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9533461"},{"key":"ref90","article-title":"Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research","author":"mei","year":"2023"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2502245"},{"key":"ref86","first-page":"1","article-title":"Hyu submission for the DCASE 2023 task 6a: Automated audio captioning model using al-mixgen and synonyms substitution","author":"cho","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPAASC55919.2022.9980325"},{"key":"ref88","article-title":"Introducing ChatGPT","year":"2022"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1145\/219717.219748"},{"key":"ref82","first-page":"140","article-title":"Continual learning for automated audio captioning using the learning without forgetting approach","author":"berg","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1614"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-140"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746601"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPAASC55919.2022.9980242"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2087"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref109","first-page":"1","article-title":"The SJTU system for DCASE2021 challenge task 6: Audio captioning based on encoder pre-training and reinforcement learning","author":"xu","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097021"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12340"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746834"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414834"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3293015"},{"key":"ref105","first-page":"1","article-title":"BERTScore: Evaluating text generation with BERT","author":"zhang","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref77","article-title":"Investigations in audio captioning: Addressing vocabulary imbalance and evaluating suitability of language-centric performance metrics","author":"kothinti","year":"0","journal-title":"Proc IEEE EUSIPCO"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W15-2812"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746894"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref2","first-page":"119","article-title":"Audiocaps: Generating captions for audios in the wild","author":"kim","year":"0","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682377"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746427"},{"key":"ref111","first-page":"1","article-title":"CP-JKU's submission to task 6a of the DCASE2022 challenge: A bart encoder-decoder for automatic audio captioning trained via the reinforce algorithm and transfer learning","author":"primus","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018626"},{"key":"ref112","first-page":"1","article-title":"Irit-ups DCASE 2023 audio captioning and retrieval system","author":"labb\u00e9","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref73","first-page":"1","article-title":"The SJTU system for DCASE2022 challenge task 6: Audio captioning with audio-text retrieval pre-training","author":"xu","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref72","first-page":"196","article-title":"CL4AC: A contrastive loss for audio captioning","author":"liu","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref110","first-page":"1","article-title":"Automated audio captioning with keywords guidance","author":"mei","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref68","first-page":"206","article-title":"An encoder-decoder based audio captioning system with transfer and reinforcement learning","author":"mei","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.2.270"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362117"},{"key":"ref63","first-page":"1","article-title":"Leveraging multi-task training and image retrieval with clap for audio captioning","author":"sun","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref66","first-page":"1171","article-title":"Scheduled sampling for sequence prediction with recurrent neural networks","author":"bengio","year":"0","journal-title":"Proc Int Conf Adv Neural Inf Process Syst"},{"key":"ref113","first-page":"1","article-title":"Label-refined sequential training with noisy data for automated audio captioning","author":"sim","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events Challenge"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096522"},{"key":"ref60","first-page":"1","article-title":"Audio captioning using sound event detection","author":"eren","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref62","first-page":"6","article-title":"Multi-task regularization based on infrequent classes for audio captioning","author":"cak?r","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"},{"key":"ref61","first-page":"1","article-title":"Automated audio captioning with multi-task learning","author":"ye","year":"0","journal-title":"Proc Conf Detection Classification Acoust Scenes Events"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/10304349\/10285526.pdf?arnumber=10285526","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T19:33:29Z","timestamp":1700508809000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10285526\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":113,"URL":"https:\/\/doi.org\/10.1109\/taslp.2023.3321968","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}