{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T21:57:29Z","timestamp":1772056649461,"version":"3.50.1"},"reference-count":97,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23B2018"],"award-info":[{"award-number":["U23B2018"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Municipal Science and Technology Major Project","award":["2021SHZDZX0102"],"award-info":[{"award-number":["2021SHZDZX0102"]}]},{"name":"Yangtze River Delta Science and Technology Innovation Community Joint Research","award":["2024CSJGG01100"],"award-info":[{"award-number":["2024CSJGG01100"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. 
Signal Process."],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1109\/jstsp.2026.3653157","type":"journal-article","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T22:01:02Z","timestamp":1768255262000},"page":"63-76","source":"Crossref","is-referenced-by-count":0,"title":["SLAM-LLM: A Modular, Open-Source Multimodal Large Language Model Framework and Best Practice for Speech, Language, Audio and Music Processing"],"prefix":"10.1109","volume":"20","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8195-3262","authenticated-orcid":false,"given":"Ziyang","family":"Ma","sequence":"first","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Guanrou","family":"Yang","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Wenxi","family":"Chen","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Zhifu","family":"Gao","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0513-2635","authenticated-orcid":false,"given":"Yexing","family":"Du","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory, Guangdong, China"}]},{"given":"Xiquan","family":"Li","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Zhisheng","family":"Zheng","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin, Austin, TX, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6286-5530","authenticated-orcid":false,"given":"Haina","family":"Zhu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Jianheng","family":"Zhuo","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Zheshu","family":"Song","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Ruiyang","family":"Xu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2765-5889","authenticated-orcid":false,"given":"Tianrui","family":"Wang","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0588-1812","authenticated-orcid":false,"given":"Yifan","family":"Yang","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4066-2039","authenticated-orcid":false,"given":"Yanqiao","family":"Zhu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1880-7434","authenticated-orcid":false,"given":"Zhikang","family":"Niu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, 
Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2815-8494","authenticated-orcid":false,"given":"Liumeng","family":"Xue","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, SAR, China"}]},{"given":"Yinghao","family":"Ma","sequence":"additional","affiliation":[{"name":"Queen Mary University of London, London, U.K."}]},{"given":"Ruibin","family":"Yuan","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1718-3686","authenticated-orcid":false,"given":"Shiliang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tongyi Lab, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7102-9826","authenticated-orcid":false,"given":"Kai","family":"Yu","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6257-7399","authenticated-orcid":false,"given":"Eng Siong","family":"Chng","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7423-617X","authenticated-orcid":false,"given":"Xie","family":"Chen","sequence":"additional","affiliation":[{"name":"X-LANCE Lab, School of Computer Science, MoE Key Lab of Artificial Intelligence Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. 37th Int. Conf. Neural Inf. Process. 
Syst.","author":"Liu","year":"2023"},{"key":"ref2","article-title":"OpenFlamingo: An open-source framework for training large autoregressive vision-language models","volume":"abs\/2308.01390","author":"Awadalla","year":"2023","journal-title":"CoRR"},{"key":"ref3","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Radford","year":"2023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref5","first-page":"5178","article-title":"BEATs: Audio pre-training with acoustic tokenizers","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Chen","year":"2023"},{"key":"ref6","article-title":"MERT: Acoustic music understanding model with large-scale self-supervised training","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Li","year":"2024"},{"key":"ref7","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Li","year":"2023"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.5040\/9781501365072.09396"},{"key":"ref9","article-title":"LLaMA 2: Open foundation and fine-tuned chat models","volume":"abs\/2307.09288","author":"Touvron","year":"2023","journal-title":"CoRR"},{"key":"ref10","article-title":"Vicuna: An open-source Chatbot impressing GPT-4 with 90%* ChatGPT quality","author":"Chiang","year":"2023"},{"key":"ref11","article-title":"Qwen technical report","author":"Bai","year":"2023"},{"key":"ref12","article-title":"LLaMA-recipes","year":"2024"},{"key":"ref13","article-title":"LLaMA-Adapter: Efficient fine-tuning of language models with zero-init attention","author":"Zhang","year":"2023"},{"key":"ref14","article-title":"LLaMA-Adapter V2: Parameter-efficient visual instruction model","volume-title":"Proc. Int. Conf. Learn. 
Representations","author":"Gao","year":"2024"},{"key":"ref15","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref16","article-title":"DINOv2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-demos.38"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-demo.12"},{"key":"ref19","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhu","year":"2024"},{"key":"ref20","article-title":"TinyLLaVA: A framework of small-scale large multimodal models","volume":"abs\/2402.14289","author":"Zhou","year":"2024","journal-title":"CoRR"},{"key":"ref21","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. 36th Int. Conf. Neural Inf. Process. 
Syst.","author":"Alayrac","year":"2022"},{"key":"ref22","article-title":"PEFT: State-of-the-art parameter-efficient fine-tuning methods","author":"Mangrulkar","year":"2022"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-demo.45"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.135"},{"key":"ref27","article-title":"Speak foreign languages with your own voice: Cross-lingual neural codec language modeling","author":"Zhang","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.115"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"ref34","article-title":"TinyLLaMA: An open-source small language model","author":"Zhang","year":"2024"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2059"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref37","article-title":"Branchformer: Parallel MLP-attention architectures to capture local and global context for speech recognition and understanding","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Peng","year":"2022"},{"key":"ref38","article-title":"Zipformer: A faster and better encoder for automatic speech recognition","volume-title":"Proc. Int. Conf. Learn. 
Representations","author":"Yao","year":"2024"},{"key":"ref39","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. 34th Int. Conf. Neural Inf. Process. Syst.","author":"Baevski","year":"2020"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1194"},{"key":"ref41","article-title":"The ASRU 2019 mandarin-english code-switching speech recognition challenge: Open datasets, tracks, methods and results","author":"Shi","year":"2020"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448079"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448106"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1566"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-767"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447782"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-461"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447438"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447652"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832154"},{"key":"ref51","article-title":"LRS3-TED: A large-scale dataset for visual speech recognition","author":"Afouras","year":"2018"},{"key":"ref52","article-title":"Learning audio-visual speech representation by masked multimodal cluster prediction","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Shi","year":"2022"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-2027"},{"key":"ref55","first-page":"2012","article-title":"MuST-C: A multilingual speech translation corpus","volume-title":"Proc. Conf. North Amer. Chapter Assoc. 
Comput. Linguistics","author":"Gangi","year":"2019"},{"key":"ref56","article-title":"No language left behind: Scaling human-centered machine translation","author":"Costa-jussà","year":"2022"},{"key":"ref57","article-title":"BLSP-KD: Bootstrapping language-speech pre-training via knowledge distillation","author":"Wang","year":"2024"},{"key":"ref58","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tang","year":"2024"},{"key":"ref59","article-title":"Seamless: Multilingual expressive and streaming speech translation","author":"Barrault","year":"2023"},{"key":"ref60","article-title":"Qwen2-audio technical report","author":"Chu","year":"2024"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29902"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.801"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3390182"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.931"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3124365"},{"key":"ref66","article-title":"An embarrassingly simple approach for LLM with strong ASR capacity","author":"Ma","year":"2024"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-488"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref69","first-page":"119","article-title":"Audiocaps: Generating captions for audios in the wild","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics","author":"Kim","year":"2019"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2024.3419446"},{"key":"ref71","article-title":"What is the ground truth? Reliability of multi-annotator data for audio tagging","volume-title":"Proc. 
EUSIPCO","author":"Martín-Morató","year":"2021"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1009"},{"key":"ref74","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. Assoc. Comput. Linguistics","author":"Banerjee","year":"2005"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746427"},{"key":"ref79","first-page":"3807","article-title":"EAT: Self-supervised pre-training with efficient audio transformer","volume-title":"Proc. Int. Joint Conf. Artif. Intell.","author":"Chen","year":"2024"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889071"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref82","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu","year":"2022"},{"key":"ref83","article-title":"DeCap: Decoding CLIP latents for zero-shot captioning via text-only training","volume-title":"Proc. Int. Conf. Learn. 
Representations","author":"Li","year":"2023"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446672"},{"key":"ref85","article-title":"BEATs-based audio captioning model with INSTRUCTOR embedding supervision and ChatGPT mix-up","author":"Wu","year":"2023","journal-title":"Rep., DCASE Challenge"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446343"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-65"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-41"},{"key":"ref89","article-title":"Weakly-supervised automated audio captioning via text only training","author":"Kouzelis","year":"2023"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3567770"},{"key":"ref91","article-title":"LP-MusicCaps: LLM-based pseudo music captioning","author":"Doh","year":"2023"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-naacl.231"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448314"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3602320"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9533461"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890325"}],"container-title":["IEEE Journal of Selected Topics in Signal 
Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/4200690\/11409413\/11346946.pdf?arnumber=11346946","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T20:56:36Z","timestamp":1772052996000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11346946\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":97,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2026.3653157","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1]]}}}