{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T14:19:54Z","timestamp":1781878794963,"version":"3.54.5"},"reference-count":112,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"New Generation Artificial Intelligence-National Science and Technology","award":["2025ZD0123602"],"award-info":[{"award-number":["2025ZD0123602"]}]},{"name":"National Cryptologic Science Foundation of China","award":["2025NCSF01012"],"award-info":[{"award-number":["2025NCSF01012"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171326"],"award-info":[{"award-number":["62171326"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Dependable and Secure Comput."],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1109\/tdsc.2026.3661073","type":"journal-article","created":{"date-parts":[[2026,2,4]],"date-time":"2026-02-04T20:50:49Z","timestamp":1770238249000},"page":"6085-6102","source":"Crossref","is-referenced-by-count":1,"title":["AudioJailbreak: Jailbreak Attacks Against End-to-End Large Audio-Language Models"],"prefix":"10.1109","volume":"23","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8277-3119","authenticated-orcid":false,"given":"Guangke","family":"Chen","sequence":"first","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fu","family":"Song","sequence":"additional","affiliation":[{"name":"Key Laboratory of System Software (Chinese Academy of Sciences), Institute of Software, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhe","family":"Zhao","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2018-9344","authenticated-orcid":false,"given":"Xiaojun","family":"Jia","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7300-9215","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5009-3095","authenticated-orcid":false,"given":"Yanchen","family":"Qiao","sequence":"additional","affiliation":[{"name":"Pengcheng Laboratory, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4783-876X","authenticated-orcid":false,"given":"Weizhe","family":"Zhang","sequence":"additional","affiliation":[{"name":"Pengcheng Laboratory, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6933-3298","authenticated-orcid":false,"given":"Weiping","family":"Tu","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3001-7957","authenticated-orcid":false,"given":"Yuhong","family":"Yang","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0059-8458","authenticated-orcid":false,"given":"Bo","family":"Du","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Abusing images and sounds for indirect instruction injection in multi-modal LLMs","author":"Bagdasaryan","year":"2023"},{"key":"ref2","article-title":"AdvWave: Stealthy adversarial jailbreak attack against large audio-language models","author":"Kang","year":"2024"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.596"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/DSN-W65791.2025.00074"},{"key":"ref5","article-title":"Voice jailbreak attacks against GPT-4o","author":"Shen","year":"2024"},{"key":"ref6","article-title":"Unveiling the safety of GPT-4o: An empirical study using jailbreak attacks","author":"Ying","year":"2024"},{"key":"ref7","article-title":"Multilingual and multi-accent jailbreaking of audio LLMs","volume-title":"Proc. 2nd Conf. Lang. Model.","author":"Roh"},{"key":"ref8","article-title":"Apple Siri: Get everyday tasks done using only your voice. just say \u201dSiri\u201d or \u201dhey Siri\u201d\u2019 to start your request","year":"2024"},{"key":"ref9","article-title":"Read speak app: AI speaking coach","author":"Yuan","year":"2024"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICOT.2018.8705924"},{"key":"ref11","article-title":"LLaSM: Large language and speech model","author":"Shu","year":"2023"},{"key":"ref12","article-title":"Mini-omni: Language models can hear, talk while thinking in streaming","author":"Xie","year":"2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"ref14","article-title":"Risk taxonomy, mitigation, and assessment benchmarks of large language model systems","author":"Cui","year":"2024"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3658644.3690291"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.hcc.2024.100211"},{"key":"ref17","article-title":"Jailbreak attacks and defenses against large language models: A survey","author":"Yi","year":"2024"},{"key":"ref18","first-page":"80079","article-title":"Jailbroken: How does LLM safety training fail","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wei"},{"key":"ref19","article-title":"Deepinception: Hypnotize large language model to be jailbreaker","author":"Li","year":"2023"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2026.3660147"},{"key":"ref21","article-title":"Universal and transferable adversarial attacks on aligned language models","author":"Zou","year":"2023"},{"key":"ref22","article-title":"Jailbreaking leading safety-aligned LLMs with simple adaptive attacks","author":"Andriushchenko","year":"2024"},{"key":"ref23","article-title":"Autodan: Generating stealthy jailbreak prompts on aligned large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Liu"},{"key":"ref24","article-title":"AutoDAN: Automatic and interpretable adversarial attacks on large language models","author":"Zhu","year":"2023"},{"key":"ref25","article-title":"Improved techniques for optimization-based jailbreaking on large language models","author":"Jia","year":"2024"},{"key":"ref26","article-title":"Jailbreaking black boxlarge language models in twenty queries","author":"Chao","year":"2023"},{"key":"ref27","article-title":"Tree of attacks: Jailbreaking black-box LLMs automatically","author":"Mehrotra","year":"2023"},{"key":"ref28","article-title":"GPT-4 technical report","author":"Achiam","year":"2024"},{"key":"ref29","article-title":"WavChat: A survey of spoken dialogue models","author":"Ji","year":"2024"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.682"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1121\/1.382599"},{"key":"ref32","article-title":"Code of AudioJailbreak","year":"2025"},{"key":"ref33","article-title":"Official website of AudioJailbreak","year":"2025"},{"key":"ref34","article-title":"GPT-4V(ision) system card","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2142"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref37","article-title":"PandaGPT: One model to instruction-follow them all","author":"Su","year":"2023"},{"key":"ref38","article-title":"Mini-omni2: Towards open-source GPT-4o with vision, speech and duplex capabilities","author":"Xie","year":"2024"},{"key":"ref39","article-title":"Next-GPT: Any-to-any multimodal LLM","author":"Wu","year":"2023"},{"key":"ref40","article-title":"BuboGPT: Enabling visual grounding in multi-modal LLMs","author":"Zhao","year":"2023"},{"key":"ref41","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Reid","year":"2024"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.521"},{"key":"ref43","article-title":"VITA: Towards open-source interactive omni multimodal LLM","author":"Fu","year":"2024"},{"key":"ref44","article-title":"FunAudioLLM: Voice understanding and generation foundation models for natural interaction between humans and LLMs","author":"An","year":"2024"},{"key":"ref45","article-title":"Speech to speech: An effort for an open-sourced and modular GPT4-o","year":"2024"},{"key":"ref46","article-title":"ChatGPT can now see, hear, and speak","year":"2024"},{"key":"ref47","article-title":"Qwen2-audio technical report","volume":"abs\/2407.10759","author":"Chu","year":"2024","journal-title":"CoRR"},{"key":"ref48","article-title":"LLaMA-omni: Seamless speech interaction with large language models","author":"Fang","year":"2024"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.361"},{"key":"ref50","article-title":"GLM-4-voice: Towards intelligent and human-like end-to-end spoken chatbot","author":"Zeng","year":"2024"},{"key":"ref51","article-title":"Moshi: A speech-text foundation model for real-time dialogue","author":"D\u00e9fossez","year":"2024"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1533"},{"key":"ref53","article-title":"A comparative study of discrete speech tokens for semantic-related tasks with large language models","author":"Wang","year":"2024"},{"key":"ref54","article-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models","author":"Chu","year":"2023"},{"key":"ref55","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tang"},{"key":"ref56","article-title":"BLSP: Bootstrapping language-speech pre-training via behavior alignment of continuation writing","author":"Wang","year":"2023"},{"key":"ref57","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref58","article-title":"LLaMA3-s: Sound instruction language model 2024","author":"Research","year":"2024"},{"key":"ref59","article-title":"Multilingual jailbreak challenges in large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Deng"},{"key":"ref60","article-title":"2023 Jailbreak chat","author":"Albert","year":"2023"},{"key":"ref61","article-title":"The LLaMA 3 herd of models","year":"2024"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.30"},{"key":"ref63","article-title":"Improved large language model jailbreak detection via pretrained embeddings","author":"Galinkin","year":"2024"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/SP40001.2021.00004"},{"key":"ref65","first-page":"5231","article-title":"Imperceptible, robust, and targeted adversarial examples for automatic speech recognition","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Qin"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23288"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/tdsc.2022.3189397"},{"key":"ref68","first-page":"49","article-title":"Commandersong: A systematic approach for practical adversarial voice recognition","volume-title":"Proc. USENIX Conf. Secur.","author":"Yuan"},{"key":"ref69","first-page":"2437","article-title":"QFA2SR: Query-free adversarial transfer attacks to speaker recognition systems","volume-title":"Proc. USENIX Conf. Secur.","author":"Chen"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2022.3220673"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM48880.2022.9796934"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3548606.3559357"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.06083"},{"key":"ref74","article-title":"Explaining and harnessing adversarial examples","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Goodfellow"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/3372297.3423348"},{"key":"ref76","first-page":"3799","article-title":"$\\lbrace${SMACK$\\rbrace$}: Semantically meaningful adversarial audio attack","volume-title":"Proc. 32nd USENIX Secur. Symp.","author":"Yu"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/3376897.3377856"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICME46284.2020.9102886"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1353"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/s11265-020-01629-9"},{"key":"ref81","article-title":"LaserAdv: Laser adversarial attacks on speech recognition systems","volume-title":"Proc. 33rd USENIX Secur. Symp.","author":"Zhang"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2024.23030"},{"key":"ref83","article-title":"Coqui TTS is a library for advanced text-to-speech generation","author":"TTS","year":"2024"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/SP61157.2025.00238"},{"key":"ref85","article-title":"Harmbench: A standardized evaluation framework for automated red teaming and robust refusal","author":"Mazeika","year":"2024"},{"key":"ref86","article-title":"Silero VAD: Pre-trained enterprise-grade voice activity detector (VAD), number detector and language classifier","year":"2024"},{"key":"ref87","article-title":"The next generation of ai: Humanoid robot assistants","year":"2024"},{"key":"ref88","article-title":"LLM latency benchmark by use cases","year":"2024"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/1476589.1476628"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-598"},{"key":"ref91","article-title":"VAD vs event-triggered for AI speech-to-speech applications","year":"2025"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/icassp55912.2026.11463929"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2023.104151"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.7717\/peerj-cs.1901"},{"key":"ref95","article-title":"The curious case of neural text degeneration","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Holtzman"},{"key":"ref96","article-title":"HuggingFaceH4 instruction dataset","year":"2023"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2016.7760424"},{"key":"ref98","first-page":"155","article-title":"MedleyDB: A multitrack dataset for annotation-intensive MIR research","volume-title":"Proc. 15th Int. Soc. Music Inf. Retrieval Conf.","author":"Bittner"},{"key":"ref99","article-title":"MedleyDB 2.0: New data and a system for sustainable data collection","volume":"36","author":"Bittner","year":"2016","journal-title":"ISMIR Late Breaking Demo Papers"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1145\/3560905.3568518"},{"key":"ref101","first-page":"7285","article-title":"Devil in the room: Triggering audio backdoors in the physical world","volume-title":"Proc. 33rd USENIX Secur. Symp.","author":"Chen"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2025.240747"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/SP.2017.49"},{"key":"ref104","article-title":"LLaMA 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref105","article-title":"Moderation: Identify potentially harmful content in text and images","year":"2025"},{"key":"ref106","article-title":"The credamo platform","year":"2017"},{"key":"ref107","article-title":"I\u2019m spartacus, no, i\u2019m spartacus: Measuring and understanding LLM identity confusion","author":"Li","year":"2024"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1145\/3689217.3690615"},{"key":"ref109","first-page":"4581","article-title":"SafeSpeech: Robust and universal voice protection against malicious speech synthesis","volume-title":"Proc. Conf. USENIX Secur.","author":"Zhang"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00765-8"},{"key":"ref111","article-title":"Auditory masking: Using sound to control sound","author":"Redon","year":"2023"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-07974-5"}],"container-title":["IEEE Transactions on Dependable and Secure Computing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/8858\/11517592\/11371728.pdf?arnumber=11371728","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T03:05:16Z","timestamp":1778900716000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11371728\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":112,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tdsc.2026.3661073","relation":{},"ISSN":["1545-5971","1941-0018","2160-9209"],"issn-type":[{"value":"1545-5971","type":"print"},{"value":"1941-0018","type":"electronic"},{"value":"2160-9209","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5]]}}}