{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T08:56:22Z","timestamp":1768553782066,"version":"3.49.0"},"reference-count":51,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004826","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","award":["Z200002"],"award-info":[{"award-number":["Z200002"]}],"id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U19B2036"],"award-info":[{"award-number":["U19B2036"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62225601"],"award-info":[{"award-number":["62225601"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Youth Innovative Research Team of BUPT","award":["2023QNTD02"],"award-info":[{"award-number":["2023QNTD02"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. 
Process."],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/taslp.2023.3293015","type":"journal-article","created":{"date-parts":[[2023,7,6]],"date-time":"2023-07-06T17:24:32Z","timestamp":1688664272000},"page":"2643-2657","source":"Crossref","is-referenced-by-count":11,"title":["ACTUAL: Audio Captioning With Caption Feature Space Regularization"],"prefix":"10.1109","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1172-6846","authenticated-orcid":false,"given":"Yiming","family":"Zhang","sequence":"first","affiliation":[{"name":"Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5992-2892","authenticated-orcid":false,"given":"Hong","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Artificial Intelligence, School of Information and Electrical Engineering, Ludong University, Yantai, Shandong, China"}]},{"given":"Ruoyi","family":"Du","sequence":"additional","affiliation":[{"name":"Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6856-8928","authenticated-orcid":false,"given":"Zheng-Hua","family":"Tan","sequence":"additional","affiliation":[{"name":"Department of Electronic Systems, Aalborg University, Aalborg, Denmark"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8393-5703","authenticated-orcid":false,"given":"Wenwu","family":"Wang","sequence":"additional","affiliation":[{"name":"Centre for Vision, Speech and Signal Processing, University of Surrey, Guildford, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2950-2488","authenticated-orcid":false,"given":"Zhanyu","family":"Ma","sequence":"additional","affiliation":[{"name":"Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing 
University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8650-1603","authenticated-orcid":false,"given":"Yuan","family":"Dong","sequence":"additional","affiliation":[{"name":"Pattern Recognition and Intelligent System Laboratory, School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"263","reference":[{"key":"ref13","article-title":"Leveraging state-of-the-art ASR techniques to audio captioning","author":"narisetty","year":"2021"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747676"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10510"},{"key":"ref14","first-page":"196","article-title":"CL4AC: A contrastive loss for audio captioning","author":"liu","year":"0","journal-title":"Proc 6th Detection Classification Acoust Scenes Events Workshop"},{"key":"ref11","article-title":"Automated audio captioning with temporal attention","author":"wang","year":"2020"},{"key":"ref10","article-title":"The sjtu system for dcase2021 challenge task 6: Audio captioning based on encoder pre-training and reinforcement learning","author":"xu","year":"2021"},{"key":"ref17","first-page":"6","article-title":"Automated audio captioning with weakly supervised pre-training and word selection methods.","author":"han","year":"0","journal-title":"Proc DCASE"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3189536"},{"key":"ref19","first-page":"21","article-title":"Audio captioning based on transformer and pretrained cnn","author":"chen","year":"0","journal-title":"Proc Detection Classification Acoust Scenes Events Workshop"},{"key":"ref18","article-title":"Gated recurrent unit (GRU) for emotion classification from noisy speech","author":"rana","year":"2016"},{"key":"ref51","first-page":"8748","article-title":"Learning transferable visual models from natural language 
supervision","author":"radford","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096877"},{"key":"ref45","article-title":"The DCASE 2021 challenge task 6 system: Automated audio captioning with weakly supervised pre-training and word selection methods","author":"yuan","year":"2021"},{"key":"ref48","article-title":"Improving the performance of automated audio captioning via integrating the acoustic and textual information","author":"ye","year":"2021"},{"key":"ref47","article-title":"Irit-ups DCASE 2022 task6a system: Stochastic decoding methods for audio captioning","author":"labb\u00e9","year":"2022"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref41","first-page":"1171","article-title":"Scheduled sampling for sequence prediction with recurrent neural networks","author":"bengio","year":"0","journal-title":"Proc 28th Int Conf Neural Inf Process Syst"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.708"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref49","article-title":"Efficient estimation of word representations in vector 
space","author":"mikolov","year":"2013"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.33682\/sezz-vd31"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682377"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413982"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-022-00259-2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2087"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362117"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746427"},{"key":"ref40","first-page":"90","article-title":"Diversity and Bias in Audio Captioning Datasets","author":"morato","year":"0","journal-title":"Proc Detection Classification Acoust Scenes Events"},{"key":"ref35","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"lin","year":"0","journal-title":"Proc Workshop Text Summarization Branches Out"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref37","first-page":"4566","article-title":"CIDEr: Consensus-based image description evaluation","author":"vedantam","year":"0","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref36","first-page":"65","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","author":"banerjee","year":"0","journal-title":"Proc ACL Workshop Intrinsic Extrinsic Eval Meas Mach Transl Summarization"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref33","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"0","journal-title":"Proc 3rd Int Conf Learn Representations"},{"key":"ref32","first-page":"6000","article-title":"Attention is all you 
need","author":"vaswani","year":"0","journal-title":"Proc 31st Int Conf Neural Inf Process Syst"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2017.8170058"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref24","article-title":"Impact of visual assistance for automated audio captioning","author":"boes","year":"2023"},{"key":"ref23","article-title":"Visually-aware audio captioning with adaptive audio-visual attention","author":"liu","year":"2022"},{"key":"ref26","article-title":"An encoder-decoder based audio captioning system with transfer and reinforcement learning for DCASE challenge 2021 task 6","author":"mei","year":"2021"},{"key":"ref25","first-page":"225","article-title":"A CRNN-GRU based reinforcement learning approach to audio captioning","author":"xu","year":"0","journal-title":"Proc DCASE"},{"key":"ref20","first-page":"60","article-title":"Evaluating off-the-shelf machine listening and natural language models for automated audio captioning","author":"weck","year":"0","journal-title":"Proc 6th Detection Classification Acoust Scenes Events Workshop"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.2020.00014"},{"key":"ref21","first-page":"119","article-title":"Audiocaps: Generating captions for audios in the wild","author":"kim","year":"0","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746834"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746894"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language 
Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9970249\/10174663.pdf?arnumber=10174663","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,7]],"date-time":"2023-08-07T18:23:30Z","timestamp":1691432610000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10174663\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/taslp.2023.3293015","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]}}}