{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:29:58Z","timestamp":1776882598627,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,12,6]],"date-time":"2022-12-06T00:00:00Z","timestamp":1670284800000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1948547"],"award-info":[{"award-number":["1948547"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,12,6]]},"DOI":"10.1145\/3485832.3485892","type":"proceedings-article","created":{"date-parts":[[2021,12,6]],"date-time":"2021-12-06T13:42:32Z","timestamp":1638798152000},"page":"720-731","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["CommanderGabble: A Universal Attack Against ASR Systems Leveraging Fast Speech"],"prefix":"10.1145","author":[{"given":"Zhaohe (John)","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Oklahoma, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Edwin","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Oklahoma, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Song","family":"Fang","sequence":"additional","affiliation":[{"name":"University of Oklahoma, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,12,6]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2014. The CMU Pronunciation Dictionary (version 0.7b)[Online. http:\/\/www.speech.cs.cmu.edu\/"},{"key":"e_1_3_2_1_2_1","unstructured":"2021. Amazon Mechanical Turk. https:\/\/www.mturk.com"},{"key":"e_1_3_2_1_3_1","unstructured":"2021. LOGIOS Lexicon Tool. http:\/\/www.speech.cs.cmu.edu\/tools\/lextool.html"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23362"},{"key":"e_1_3_2_1_5_1","volume-title":"SoK: The Faults in our ASRs: An Overview of Attacks against Automatic Speech Recognition and Speaker Identification Systems. arXiv e-prints","author":"Abdullah Hadi","year":"2020","unstructured":"Hadi Abdullah, Kevin Warren, Vincent Bindschaedler, Nicolas Papernot, and Patrick Traynor. 2020. SoK: The Faults in our ASRs: An Overview of Attacks against Automatic Speech Recognition and Speaker Identification Systems. arXiv e-prints (2020), arXiv\u20132007."},{"key":"e_1_3_2_1_6_1","volume-title":"31st Conference on Neural Information Processing Systems (NIPS","author":"Alzantot Moustafa","year":"2017","unstructured":"Moustafa Alzantot, Bharathan Balaji, and Mani Srivastava. 2017. Did you hear that? Adversarial examples against automatic speech recognition. In 31st Conference on Neural Information Processing Systems (NIPS 2017)."},{"key":"e_1_3_2_1_7_1","unstructured":"Amazon. 2020. About the Automatic Speech Recognition (ASR) Evaluation tool. https:\/\/developer.amazon.com\/en-US\/docs\/alexa\/asr\/about-asr.html"},{"key":"e_1_3_2_1_8_1","unstructured":"Amazon. 2020. New Alexa Features for the Vehicle. https:\/\/www.amazon.com\/b?ie=UTF8&node=21439303011"},{"key":"e_1_3_2_1_9_1","volume-title":"The Biometric Computing: Recognition and Registration","author":"Arya Karm\u00a0Veer","unstructured":"Karm\u00a0Veer Arya and Robin\u00a0Singh Bhadoria. 2019. The Biometric Computing: Recognition and Registration. CRC Press."},{"key":"e_1_3_2_1_10_1","unstructured":"International\u00a0Phonetic Association. 2020. IPA charts and subcharts in four fonts. https:\/\/www.internationalphoneticassociation.org\/"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2007.02.006"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 11th International Conference on Speech and Computer (SPECOM). 3\u201316","author":"Benzeghiba Mohamed","year":"2006","unstructured":"Mohamed Benzeghiba, Renato De\u00a0Mori, Olivier Deroo, Stephane Dupont, Denis Jouvet, Luciano Fissore, Pietro Laface, Alfred Mertins, Christophe Ris, Richard Rose, 2006. Impact of variabilities on speech recognition. In Proceedings of the 11th International Conference on Speech and Computer (SPECOM). 3\u201316."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5220\/0007309500750087"},{"key":"e_1_3_2_1_14_1","volume-title":"Hidden Voice Commands. In 25th USENIX Security Symposium (USENIX Security 16)","author":"Carlini Nicholas","year":"2016","unstructured":"Nicholas Carlini, Pratyush Mishra, Tavish Vaidya, Yuankai Zhang, Micah Sherr, Clay Shields, David Wagner, and Wenchao Zhou. 2016. Hidden Voice Commands. In 25th USENIX Security Symposium (USENIX Security 16). USENIX Association, Austin, TX, 513\u2013530."},{"key":"e_1_3_2_1_15_1","volume-title":"Audio Adversarial Examples: Targeted Attacks on Speech-to-Text. In 2018 IEEE Security and Privacy Workshops (SPW). 1\u20137.","author":"Carlini N.","unstructured":"N. Carlini and D. Wagner. 2018. Audio Adversarial Examples: Targeted Attacks on Speech-to-Text. In 2018 IEEE Security and Privacy Workshops (SPW). 1\u20137."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2020.23055"},{"key":"e_1_3_2_1_17_1","volume-title":"29th {USENIX} Security Symposium ({USENIX} Security 20). 2667\u20132684.","author":"Chen Yuxuan","unstructured":"Yuxuan Chen, Xuejing Yuan, Jiangshan Zhang, Yue Zhao, Shengzhi Zhang, Kai Chen, and XiaoFeng Wang. 2020. Devil\u2019s whisper: A general approach for physical adversarial attacks against commercial black-box speech recognition devices. In 29th {USENIX} Security Symposium ({USENIX} Security 20). 2667\u20132684."},{"key":"e_1_3_2_1_18_1","volume-title":"Explaining sonority projection effects. Phonology","author":"Daland Robert","year":"2011","unstructured":"Robert Daland, Bruce Hayes, James White, Marc Garellek, Andrea Davis, and Ingrid Norrmann. 2011. Explaining sonority projection effects. Phonology (2011)."},{"key":"e_1_3_2_1_19_1","volume-title":"A Frequency Dictionary of Contemporary American English","author":"Davies Mark","unstructured":"Mark Davies and Dee Gardner. 2009. A Frequency Dictionary of Contemporary American English. Taylor & Francis."},{"key":"e_1_3_2_1_20_1","volume-title":"Language files: Materials for an introduction to language and linguistics","author":"Dawson Hope","unstructured":"Hope Dawson, Michael Phelan, 2016. Language files: Materials for an introduction to language and linguistics. The Ohio State University Press."},{"key":"e_1_3_2_1_21_1","first-page":"160","article-title":"Syllabic features and phonic impression in English","volume":"22","author":"Delattre Pierre","year":"1969","unstructured":"Pierre Delattre and Carroll Olsen. 1969. Syllabic features and phonic impression in English, German, French and Spanish. Lingua 22(1969), 160\u2013175.","journal-title":"German, French and Spanish. Lingua"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.3159302"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1017\/S0022226700002267"},{"key":"e_1_3_2_1_24_1","unstructured":"Google. 2020. Android Auto. https:\/\/www.android.com\/auto\/"},{"key":"e_1_3_2_1_25_1","unstructured":"Google Cloud. 2020. Method: text.synthesize. https:\/\/cloud.google.com\/text-to-speech\/docs\/reference\/rest\/v1\/text\/synthesize"},{"key":"e_1_3_2_1_26_1","unstructured":"Google Cloud. 2020. Speech-to-Text. https:\/\/cloud.google.com\/speech-to-text"},{"key":"e_1_3_2_1_27_1","unstructured":"Google Cloud. 2020. Text-to-Speech. https:\/\/cloud.google.com\/text-to-speech\/"},{"key":"e_1_3_2_1_28_1","unstructured":"Awni Hannun Carl Case Jared Casper Bryan Catanzaro Greg Diamos Erich Elsen Ryan Prenger Sanjeev Satheesh Shubho Sengupta Adam Coates 2014. Deep speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:1412.5567(2014)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2005-138"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2773986"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Shreya Khare Rahul Aralikatte and Senthil Mani. 2018. Adversarial black-box attacks on automatic speech recognition systems using multi-objective evolutionary optimization. arXiv preprint arXiv:1811.01312(2018).","DOI":"10.21437\/Interspeech.2019-2420"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.5555\/3277203.3277207"},{"key":"e_1_3_2_1_33_1","unstructured":"Vladimir\u00a0I Levenshtein. 1966. Binary codes capable of correcting deletions insertions and reversals. In Soviet physics doklady Vol.\u00a010. 707\u2013710."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372297.3423348"},{"key":"e_1_3_2_1_35_1","volume-title":"Speech recognition by machines and humans. Speech communication 22, 1","author":"Lippmann P","year":"1997","unstructured":"Richard\u00a0P Lippmann. 1997. Speech recognition by machines and humans. Speech communication 22, 1 (1997), 1\u201315."},{"key":"e_1_3_2_1_36_1","unstructured":"Taylor Martin David Priest Dale Smith and Andrew Gebhart. 2020. Every Google Assistant command for your Nest speaker or display. https:\/\/www.cnet.com\/how-to\/every-google-assistant-command-for-your-nest-speaker-or-display\/"},{"key":"e_1_3_2_1_37_1","volume-title":"The concise Oxford dictionary of linguistics","author":"Matthews Peter\u00a0Hugoe","unstructured":"Peter\u00a0Hugoe Matthews and Peter\u00a0Hugoe Matthews. 2014. The concise Oxford dictionary of linguistics. Oxford University Press."},{"key":"e_1_3_2_1_38_1","volume-title":"Facilitation in recognizing pairs of words: evidence of a dependence between retrieval operations.Journal of experimental psychology 90, 2","author":"Meyer E","year":"1971","unstructured":"David\u00a0E Meyer and Roger\u00a0W Schvaneveldt. 1971. Facilitation in recognizing pairs of words: evidence of a dependence between retrieval operations.Journal of experimental psychology 90, 2 (1971), 227."},{"key":"e_1_3_2_1_39_1","volume-title":"1996 IEEE International Conference on Acoustics, Speech, and Signal Processing Conference Proceedings, Vol.\u00a01. 335\u2013338","author":"Mirghafori N.","unstructured":"N. Mirghafori, E. Fosler, and N. Morgan. 1996. Towards robustness to fast speech in ASR. In 1996 IEEE International Conference on Acoustics, Speech, and Signal Processing Conference Proceedings, Vol.\u00a01. 335\u2013338."},{"key":"e_1_3_2_1_40_1","first-page":"138","article-title":"Voice Recognition Algorithms using Mel Frequency Cepstral Coefficient (MFCC) and Dynamic Time Warping (DTW) Techniques","volume":"2","author":"Muda Lindasalwa","year":"2010","unstructured":"Lindasalwa Muda, Mumtaj Begam, and I Elamvazuthi. 2010. Voice Recognition Algorithms using Mel Frequency Cepstral Coefficient (MFCC) and Dynamic Time Warping (DTW) Techniques. Journal of Computing 2, 3 (2010), 138\u2013143.","journal-title":"Journal of Computing"},{"key":"e_1_3_2_1_41_1","volume-title":"Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499(2016).","author":"van\u00a0den Oord Aaron","year":"2016","unstructured":"Aaron van\u00a0den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. 2016. Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499(2016)."},{"key":"e_1_3_2_1_42_1","volume-title":"IEEE 2011 workshop on automatic speech recognition and understanding. IEEE Signal Processing Society.","author":"Povey Daniel","year":"2011","unstructured":"Daniel Povey, Arnab Ghoshal, Gilles Boulianne, Lukas Burget, Ondrej Glembek, Nagendra Goel, Mirko Hannemann, Petr Motlicek, Yanmin Qian, Petr Schwarz, 2011. The Kaldi speech recognition toolkit. In IEEE 2011 workshop on automatic speech recognition and understanding. IEEE Signal Processing Society."},{"key":"e_1_3_2_1_43_1","unstructured":"David Priest Tauren Dyson and Taylor Martin. 2020. Every Alexa command you can give your Amazon Echo smart speaker. https:\/\/www.cnet.com\/how-to\/every-alexa-command-you-can-give-your-amazon-echo-smart-speaker\/"},{"key":"e_1_3_2_1_44_1","unstructured":"Emil Protalinski. 2019. ProBeat: Has Google\u2019s word error rate progress stalled?https:\/\/venturebeat.com\/2019\/05\/10\/probeat-has-googles-word-error-rate-progress-stalled\/"},{"key":"e_1_3_2_1_45_1","volume-title":"International conference on machine learning. PMLR, 5231\u20135240","author":"Qin Yao","year":"2019","unstructured":"Yao Qin, Nicholas Carlini, Garrison Cottrell, Ian Goodfellow, and Colin Raffel. 2019. Imperceptible, robust, and targeted adversarial examples for automatic speech recognition. In International conference on machine learning. PMLR, 5231\u20135240."},{"key":"e_1_3_2_1_46_1","volume-title":"15th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 18). 547\u2013560.","author":"Roy Nirupam","unstructured":"Nirupam Roy, Sheng Shen, Haitham Hassanieh, and Romit\u00a0Roy Choudhury. 2018. Inaudible voice commands: The long-range attack and defense. In 15th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 18). 547\u2013560."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23288"},{"key":"e_1_3_2_1_48_1","unstructured":"Jonathan Shen Patrick Nguyen Yonghui Wu Zhifeng Chen Mia\u00a0X Chen Ye Jia Anjuli Kannan Tara Sainath Yuan Cao Chung-Cheng Chiu 2019. Lingvo: a modular and scalable framework for sequence-to-sequence modeling. arXiv preprint arXiv:1902.08295(2019)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1995.479672"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133956.3138836"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2018.04.003"},{"key":"e_1_3_2_1_52_1","volume-title":"Attacking Speaker Recognition Systems with Phoneme Morphing. In European Symposium on Research in Computer Security. Springer, 471\u2013492","author":"Turner Henry","year":"2019","unstructured":"Henry Turner, Giulio Lovisotto, and Ivan Martinovic. 2019. Attacking Speaker Recognition Systems with Phoneme Morphing. In European Symposium on Research in Computer Security. Springer, 471\u2013492."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2020.24068"},{"key":"e_1_3_2_1_54_1","volume-title":"CommanderSong: A Systematic Approach for Practical Adversarial Voice Recognition. In 27th USENIX Security Symposium (USENIX Security 18)","author":"Yuan Xuejing","year":"2018","unstructured":"Xuejing Yuan, Yuxuan Chen, Yue Zhao, Yunhui Long, Xiaokang Liu, Kai Chen, Shengzhi Zhang, Heqing Huang, XiaoFeng Wang, and Carl\u00a0A. Gunter. 2018. CommanderSong: A Systematic Approach for Practical Adversarial Voice Recognition. In 27th USENIX Security Symposium (USENIX Security 18). USENIX Association, Baltimore, MD, 49\u201364."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133956.3134052"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"N. Zhang X. Mi X. Feng X. Wang Y. Tian and F. Qian. 2019. Dangerous Skills: Understanding and Mitigating Security Risks of Voice-Controlled Third-Party Functions on Virtual Personal Assistant Systems. In 2019 IEEE Symposium on Security and Privacy (SP). 1381\u20131396.","DOI":"10.1109\/SP.2019.00016"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23525"},{"key":"e_1_3_2_1_58_1","unstructured":"Zoom. 2021. Video Conferencing Web Conferencing Webinars Screen Sharing - Zoom. https:\/\/zoom.us\/"}],"event":{"name":"ACSAC '21: Annual Computer Security Applications Conference","location":"Virtual Event USA","acronym":"ACSAC '21"},"container-title":["Annual Computer Security Applications Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3485832.3485892","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3485832.3485892","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3485832.3485892","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:13:36Z","timestamp":1755890016000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3485832.3485892"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,6]]},"references-count":58,"alternative-id":["10.1145\/3485832.3485892","10.1145\/3485832"],"URL":"https:\/\/doi.org\/10.1145\/3485832.3485892","relation":{},"subject":[],"published":{"date-parts":[[2021,12,6]]},"assertion":[{"value":"2021-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}