{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T16:36:31Z","timestamp":1772642191374,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T00:00:00Z","timestamp":1719878400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61801049"],"award-info":[{"award-number":["61801049"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,7,2]]},"DOI":"10.1145\/3665451.3665532","type":"proceedings-article","created":{"date-parts":[[2024,7,23]],"date-time":"2024-07-23T06:41:36Z","timestamp":1721716896000},"page":"47-55","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Towards Evaluating the Robustness of Automatic Speech Recognition Systems via Audio Style Transfer"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-8209-2713","authenticated-orcid":false,"given":"Weifei","family":"Jin","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5766-0846","authenticated-orcid":false,"given":"Yuxin","family":"Cao","sequence":"additional","affiliation":[{"name":"Tsinghua University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4262-6131","authenticated-orcid":false,"given":"Junjie","family":"Su","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5595-9282","authenticated-orcid":false,"given":"Qi","family":"Shen","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2177-5531","authenticated-orcid":false,"given":"Kai","family":"Ye","sequence":"additional","affiliation":[{"name":"Tsinghua University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1388-7715","authenticated-orcid":false,"given":"Derui","family":"Wang","sequence":"additional","affiliation":[{"name":"CSIRO's Data61, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2032-665X","authenticated-orcid":false,"given":"Jie","family":"Hao","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4060-0839","authenticated-orcid":false,"given":"Ziyao","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,7,23]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1115","volume-title":"International conference on machine learning","author":"Dai Hanjun","year":"2018","unstructured":"Hanjun Dai, Hui Li, Tian Tian, Xin Huang, Lin Wang, Jun Zhu, and Le Song. Adversarial attack on graph structured data. In International conference on machine learning, pages 1115--1124. PMLR, 2018."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.3390\/sym11081018"},{"key":"e_1_3_2_1_3_1","volume-title":"A review of deep learning techniques for speech processing. Information Fusion, page 101869","author":"Mehrish Ambuj","year":"2023","unstructured":"Ambuj Mehrish, Navonil Majumder, Rishabh Bharadwaj, Rada Mihalcea, and Soujanya Poria. A review of deep learning techniques for speech processing. Information Fusion, page 101869, 2023."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jretconser.2020.102283"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3527153"},{"key":"e_1_3_2_1_6_1","unstructured":"Video subtitles. https:\/\/www.kapwing.com\/resources\/subtitle-statistics\/."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372297.3423348"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2019.2925452"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460120.3485383"},{"key":"e_1_3_2_1_10_1","first-page":"2273","volume-title":"30th USENIX Security Symposium (USENIX Security 21)","author":"Hussain Shehzeen","year":"2021","unstructured":"Shehzeen Hussain, Paarth Neekhara, Shlomo Dubnov, Julian McAuley, and Farinaz Koushanfar. {WaveGuard}: Understanding and mitigating audio adversarial examples. In 30th USENIX Security Symposium (USENIX Security 21), pages 2273--2290, 2021."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Learning Representations","author":"Yang Zhuolin","year":"2019","unstructured":"Zhuolin Yang, Bo Li, Pin-Yu Chen, and Dawn Song. Characterizing audio adversarial examples using temporal dependency. In International Conference on Learning Representations, 2019."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413603"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539268"},{"key":"e_1_3_2_1_14_1","first-page":"3799","volume-title":"32nd USENIX Security Symposium (USENIX Security 23)","author":"Yu Zhiyuan","year":"2023","unstructured":"Zhiyuan Yu, Yuanhaur Chang, Ning Zhang, and Chaowei Xiao. {SMACK}: Semantically meaningful adversarial audio attack. In 32nd USENIX Security Symposium (USENIX Security 23), pages 3799--3816, 2023."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.265"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461711"},{"key":"e_1_3_2_1_17_1","first-page":"5210","volume-title":"International Conference on Machine Learning","author":"Qian Kaizhi","year":"2019","unstructured":"Kaizhi Qian, Yang Zhang, Shiyu Chang, Xuesong Yang, and Mark Hasegawa-Johnson. Autovc: Zero-shot voice style transfer with only autoencoder loss. In International Conference on Machine Learning, pages 5210--5219. PMLR, 2019."},{"key":"e_1_3_2_1_18_1","first-page":"7836","volume-title":"International Conference on Machine Learning","author":"Qian Kaizhi","year":"2020","unstructured":"Kaizhi Qian, Yang Zhang, Shiyu Chang, Mark Hasegawa-Johnson, and David Cox. Unsupervised speech decomposition via triple information bottleneck. In International Conference on Machine Learning, pages 7836--7846. PMLR, 2020."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747763"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-838"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747747"},{"key":"e_1_3_2_1_22_1","first-page":"10970","article-title":"Towards style transfer for generalizable out-of-domain text-to-speech","volume":"35","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Yi Ren, Jinglin Liu, Chenye Cui, and Zhou Zhao. Generspeech: Towards style transfer for generalizable out-of-domain text-to-speech. Advances in Neural Information Processing Systems, 35:10970--10983, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_23_1","volume-title":"Styletts: A style-based generative model for natural and diverse text-to-speech synthesis. arXiv preprint arXiv:2205.15439","author":"Li Yinghao Aaron","year":"2022","unstructured":"Yinghao Aaron Li, Cong Han, and Nima Mesgarani. Styletts: A style-based generative model for natural and diverse text-to-speech synthesis. arXiv preprint arXiv:2205.15439, 2022."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP40001.2021.00014"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSC55868.2022.00071"},{"key":"e_1_3_2_1_26_1","first-page":"1","volume-title":"Audio adversarial examples: Targeted attacks on speech-to-text. In 2018 IEEE security and privacy workshops (SPW)","author":"Carlini Nicholas","year":"2018","unstructured":"Nicholas Carlini and David Wagner. Audio adversarial examples: Targeted attacks on speech-to-text. In 2018 IEEE security and privacy workshops (SPW), pages 1--7. IEEE, 2018."},{"key":"e_1_3_2_1_27_1","first-page":"15","volume-title":"Targeted adversarial examples for black box audio systems. In 2019 IEEE security and privacy workshops (SPW)","author":"Taori Rohan","year":"2019","unstructured":"Rohan Taori, Amog Kamsetty, Brenton Chu, and Nikita Vemuri. Targeted adversarial examples for black box audio systems. In 2019 IEEE security and privacy workshops (SPW), pages 15--20. IEEE, 2019."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/3491440.3491878"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/741"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746927"},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Learning Representations","author":"Chiquier Mia","year":"2022","unstructured":"Mia Chiquier, Chengzhi Mao, and Carl Vondrick. Real-time neural voice camouflage. In International Conference on Learning Representations, 2022."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17663"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3548606.3560660"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23288"},{"key":"e_1_3_2_1_35_1","first-page":"5231","volume-title":"International conference on machine learning","author":"Qin Yao","year":"2019","unstructured":"Yao Qin, Nicholas Carlini, Garrison Cottrell, Ian Goodfellow, and Colin Raffel. Imperceptible, robust, and targeted adversarial examples for automatic speech recognition. In International conference on machine learning, pages 5231--5240. PMLR, 2019."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23362"},{"key":"e_1_3_2_1_37_1","first-page":"247","volume-title":"32nd USENIX Security Symposium (USENIX Security 23)","author":"Wu Xinghui","year":"2023","unstructured":"Xinghui Wu, Shiqing Ma, Chao Shen, Chenhao Lin, Qian Wang, Qi Li, and Yuan Rao. {KENKU}: Towards efficient and stealthy black-box adversarial attacks against {ASR} systems. In 32nd USENIX Security Symposium (USENIX Security 23), pages 247--264, 2023."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP46215.2023.10179383"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3304476"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP.2017.49"},{"key":"e_1_3_2_1_42_1","volume-title":"Cstr vctk corpus: English multi-speaker corpus for cstr voice cloning toolkit (version 0.92)","author":"Yamagishi Junichi","year":"2019","unstructured":"Junichi Yamagishi, Christophe Veaux, Kirsten MacDonald, et al. Cstr vctk corpus: English multi-speaker corpus for cstr voice cloning toolkit (version 0.92). University of Edinburgh. The Centre for Speech Technology Research (CSTR), 2019."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.1078"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5928"},{"key":"e_1_3_2_1_45_1","first-page":"173","volume-title":"International conference on machine learning","author":"Amodei Dario","year":"2016","unstructured":"Dario Amodei, Sundaram Ananthanarayanan, Rishita Anubhai, Jingliang Bai, Eric Battenberg, Carl Case, Jared Casper, Bryan Catanzaro, Qiang Cheng, Guoliang Chen, et al. Deep speech 2: End-to-end speech recognition in english and mandarin. In International conference on machine learning, pages 173--182. PMLR, 2016."},{"key":"e_1_3_2_1_46_1","volume-title":"3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In Yoshua Bengio and Yann LeCun, editors, 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, 2015."},{"key":"e_1_3_2_1_47_1","unstructured":"Amazon mechanical turk. https:\/\/www.mturk.com."},{"key":"e_1_3_2_1_48_1","volume-title":"Archives of Psychology","author":"Likert Rensis","year":"1932","unstructured":"Rensis Likert. A technique for the measurement of attitudes. Archives of Psychology, 1932."}],"event":{"name":"ASIA CCS '24: ACM Asia Conference on Computer and Communications Security","location":"Singapore Singapore","acronym":"ASIA CCS '24","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"]},"container-title":["Proceedings of the 2nd ACM Workshop on Secure and Trustworthy Deep Learning Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3665451.3665532","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3665451.3665532","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:41:56Z","timestamp":1755974516000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3665451.3665532"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,2]]},"references-count":48,"alternative-id":["10.1145\/3665451.3665532","10.1145\/3665451"],"URL":"https:\/\/doi.org\/10.1145\/3665451.3665532","relation":{},"subject":[],"published":{"date-parts":[[2024,7,2]]},"assertion":[{"value":"2024-07-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}