{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T07:48:37Z","timestamp":1781768917972,"version":"3.54.5"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2021,2,10]],"date-time":"2021-02-10T00:00:00Z","timestamp":1612915200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,2,10]],"date-time":"2021-02-10T00:00:00Z","timestamp":1612915200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Sign Process Syst"],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1007\/s11265-020-01629-9","type":"journal-article","created":{"date-parts":[[2021,2,10]],"date-time":"2021-02-10T14:45:53Z","timestamp":1612968353000},"page":"1187-1200","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":19,"title":["Real-time, Robust and Adaptive Universal Adversarial Attacks Against Speaker Recognition Systems"],"prefix":"10.1007","volume":"93","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6179-0013","authenticated-orcid":false,"given":"Yi","family":"Xie","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhuohang","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Cong","family":"Shi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jian","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yingying","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bo","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2021,2,10]]},"reference":[{"key":"1629_CR1","unstructured":"Abadi, M., Barham, P., Chen, J., Chen, Z., Davis, A., Dean, J., Devin, M., Ghemawat, S., Irving, G., Isard, M., & et al. (2016). Tensorflow: a system for large-scale machine learning. In 12th USENIX symposium on operating systems design and implementation (OSDI 16) (pp. 265\u2013283)."},{"key":"1629_CR2","unstructured":"Abdoli, S., Hafemann, L.G., Rony, J., Ayed, I.B., Cardinal, P., & Koerich, A.L. (2019). Universal adversarial audio perturbations. arXiv:1908.03173."},{"key":"1629_CR3","unstructured":"Amazon: Alexa uses voice profiles to recognize your voice and personalize your experience (2020). https:\/\/www.amazon.com\/gp\/help\/customer\/display.html?nodeId=202199440."},{"key":"1629_CR4","unstructured":"Amazon: Amazon echo and Alexa devices (2020). https:\/\/www.amazon.com\/smart-home-devices\/."},{"key":"1629_CR5","unstructured":"Apple: Siri (2020). https:\/\/www.apple.com\/siri\/."},{"key":"1629_CR6","unstructured":"Bank, C. (2019). Security as unique as your voice. https:\/\/www.chase.com\/personal\/voice-biometrics."},{"key":"1629_CR7","unstructured":"Brown, T., Man\u00e9, D., Roy, A., Abadi, M., & Gilmer, J. (2017). Adversarial patch. arXiv:1712.09665."},{"key":"1629_CR8","unstructured":"Carlini, N., Mishra, P., Vaidya, T., Zhang, Y., Sherr, M., Shields, C., Wagner, D., & Zhou, W. (2016). Hidden voice commands. In 25th USENIX security symposium (USENIX security 16) (pp. 513\u2013530)."},{"key":"1629_CR9","doi-asserted-by":"crossref","unstructured":"Carlini, N., & Wagner, D. (2017). Towards evaluating the robustness of neural networks. In 2017 IEEE symposium on security and privacy (SP) (pp. 39\u201357): IEEE.","DOI":"10.1109\/SP.2017.49"},{"key":"1629_CR10","doi-asserted-by":"crossref","unstructured":"Carlini, N., & Wagner, D. (2018). Audio adversarial examples: targeted attacks on speech-to-text. In 2018 IEEE security and privacy workshops (SPW) (pp. 1\u20137): IEEE.","DOI":"10.1109\/SPW.2018.00009"},{"key":"1629_CR11","unstructured":"Chen, G., Chen, S., Fan, L., Du, X., Zhao, Z., Song, F., & Liu, Y. (2019). Who is real bob? Adversarial attacks on speaker recognition systems. arXiv:1911.01840."},{"key":"1629_CR12","doi-asserted-by":"crossref","unstructured":"Chen, P.Y., Zhang, H., Sharma, Y., Yi, J., & Hsieh, C.J. (2017). Zoo: zeroth order optimization based black-box attacks to deep neural networks without training substitute models. In Proceedings of the 10th ACM workshop on artificial intelligence and security (pp. 15\u201326).","DOI":"10.1145\/3128572.3140448"},{"key":"1629_CR13","unstructured":"Christophe, V., Junichi, Y., & Kirsten, M. (2016). Cstr vctk corpus: english multi-speaker corpus for cstr voice cloning toolkit. The Centre for Speech Technology Research (CSTR)."},{"key":"1629_CR14","unstructured":"Goodfellow, I.J., Shlens, J., & Szegedy, C. (2014). Explaining and harnessing adversarial examples. arXiv:1412.6572."},{"key":"1629_CR15","unstructured":"Google: voice match and media on google home. https:\/\/support.google.com\/googlenest\/answer\/7342711?hl=en (2019)."},{"key":"1629_CR16","unstructured":"Google: hey google (2020). https:\/\/assistant.google.com\/."},{"key":"1629_CR17","doi-asserted-by":"crossref","unstructured":"Grosse, K., Papernot, N., Manoharan, P., Backes, M., & McDaniel, P. (2017). Adversarial examples for malware detection. In European symposium on research in computer security (pp. 62\u201379): Springer.","DOI":"10.1007\/978-3-319-66399-9_4"},{"key":"1629_CR18","doi-asserted-by":"crossref","unstructured":"Hendrik Metzen, J., Chaithanya Kumar, M., Brox, T., & Fischer, V. (2017). Universal adversarial perturbations against semantic image segmentation. In Proceedings of the IEEE international conference on computer vision (pp. 2755\u20132764).","DOI":"10.1109\/ICCV.2017.300"},{"key":"1629_CR19","doi-asserted-by":"crossref","unstructured":"Kreuk, F., Adi, Y., Cisse, M., & Keshet, J. (2018). Fooling end-to-end speaker verification with adversarial examples. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 1962\u20131966): IEEE.","DOI":"10.1109\/ICASSP.2018.8462693"},{"key":"1629_CR20","unstructured":"Kurakin, A., Goodfellow, I., & Bengio, S. (2016). Adversarial examples in the physical world. arXiv:1607.02533."},{"key":"1629_CR21","doi-asserted-by":"crossref","unstructured":"Lei, Y., Scheffer, N., Ferrer, L., & McLaren, M. (2014). A novel scheme for speaker recognition using a phonetically-aware deep neural network. In 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 1695\u20131699): IEEE.","DOI":"10.1109\/ICASSP.2014.6853887"},{"key":"1629_CR22","doi-asserted-by":"crossref","unstructured":"Li, Z., Shi, C., Xie, Y., Liu, J., Yuan, B., & Chen, Y. (2020). Practical adversarial attacks against speaker recognition systems. In Proceedings of the 21st international workshop on mobile computing systems and applications (pp. 9\u201314).","DOI":"10.1145\/3376897.3377856"},{"key":"1629_CR23","doi-asserted-by":"crossref","unstructured":"Lin, Y.C., Hong, Z.W., Liao, Y.H., Shih, M.L., Liu, M.Y., & Sun, M. (2017). Tactics of adversarial attack on deep reinforcement learning agents. In Proceedings of the 26th international joint conference on artificial intelligence (pp. 3756\u20133762).","DOI":"10.24963\/ijcai.2017\/525"},{"key":"1629_CR24","unstructured":"Liu, X., Yang, H., Liu, Z., Song, L., Li, H., & Chen, Y. (2018). Dpatch: an adversarial patch attack on object detectors. arXiv:1806.02299."},{"key":"1629_CR25","doi-asserted-by":"crossref","unstructured":"McLaren, M., Lei, Y., & Ferrer, L. (2015). Advances in deep neural network approaches to speaker recognition. In 2015 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 4814\u20134818): IEEE.","DOI":"10.1109\/ICASSP.2015.7178885"},{"key":"1629_CR26","unstructured":"Microsoft: Cortana (2020). https:\/\/www.microsoft.com\/en-us\/cortana."},{"key":"1629_CR27","doi-asserted-by":"crossref","unstructured":"Moosavi-Dezfooli, S.M., Fawzi, A., Fawzi, O., & Frossard, P. (2017). Universal adversarial perturbations. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1765\u20131773).","DOI":"10.1109\/CVPR.2017.17"},{"key":"1629_CR28","unstructured":"Nair, V., & Hinton, G.E. (2010). Rectified linear units improve restricted boltzmann machines. In International conference on machine learning (ICML)."},{"key":"1629_CR29","doi-asserted-by":"crossref","unstructured":"Neekhara, P., Hussain, S., Pandey, P., Dubnov, S., McAuley, J., & Koushanfar, F. (2019). Universal adversarial perturbations for speech recognition systems. arXiv:1905.03828.","DOI":"10.21437\/Interspeech.2019-1353"},{"key":"1629_CR30","unstructured":"Povey, D., Ghoshal, A., Boulianne, G., Burget, L., Glembek, O., Goel, N., Hannemann, M., Motlicek, P., Qian, Y., Schwarz, P., & et al. (2011). The kaldi speech recognition toolkit. In IEEE 2011 workshop on automatic speech recognition and understanding: CONF. IEEE Signal Processing Society."},{"key":"1629_CR31","unstructured":"Qin, Y., Carlini, N., Cottrell, G., Goodfellow, I., & Raffel, C. (2019). Imperceptible, robust, and targeted adversarial examples for automatic speech recognition. In International conference on machine learning (pp. 5231\u20135240)."},{"key":"1629_CR32","doi-asserted-by":"crossref","unstructured":"Raj, D., Snyder, D., Povey, D., & Khudanpur, S. (2019). Probing the information encoded in x-vectors. arXiv:1909.06351.","DOI":"10.1109\/ASRU46091.2019.9003979"},{"key":"1629_CR33","unstructured":"Rix, A.W., Beerends, J.G., Hollier, M.P., & Hekstra, A.P. (2001). Perceptual evaluation of speech quality (pesq)-a new method for speech quality assessment of telephone networks and codecs. In 2001 IEEE international conference on acoustics, speech, and signal processing. Proceedings (Cat. No. 01CH37221), (Vol. 2 pp. 749\u2013752): IEEE."},{"key":"1629_CR34","doi-asserted-by":"crossref","unstructured":"Sainburg, T., Thielk, M., & Gentner, T.Q. (2019). Latent space visualization, characterization, and generation of diverse vocal communication signals. bioRxiv, 870311.","DOI":"10.1101\/870311"},{"key":"1629_CR35","doi-asserted-by":"crossref","unstructured":"Scheibler, R., Bezzam, E., & Dokmani\u0107, I. (2018). Pyroomacoustics: a python package for audio room simulation and array processing algorithms. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 351\u2013355): IEEE.","DOI":"10.1109\/ICASSP.2018.8461310"},{"key":"1629_CR36","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Povey, D., & Khudanpur, S. (2017). Deep neural network embeddings for text-independent speaker verification. In Interspeech (pp. 999\u20131003).","DOI":"10.21437\/Interspeech.2017-620"},{"key":"1629_CR37","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., McCree, A., Povey, D., & Khudanpur, S. (2019). Speaker recognition for multi-speaker conversations using x-vectors. In ICASSP 2019-2019 IEEE International conference on acoustics, speech and signal processing (ICASSP) (pp. 5796\u20135800): IEEE.","DOI":"10.1109\/ICASSP.2019.8683760"},{"key":"1629_CR38","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., & Khudanpur, S. (2018). X-vectors: robust dnn embeddings for speaker recognition. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5329\u20135333): IEEE.","DOI":"10.1109\/ICASSP.2018.8461375"},{"issue":"1","key":"1629_CR39","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R. (2014). Dropout: a simple way to prevent neural networks from overfitting. The Journal of Machine Learning Research, 15(1), 1929\u20131958.","journal-title":"The Journal of Machine Learning Research"},{"key":"1629_CR40","unstructured":"Szegedy, C., Zaremba, W., Sutskever, I., Bruna, J., Erhan, D., Goodfellow, I., & Fergus, R. (2013). Intriguing properties of neural networks. arXiv:1312.6199."},{"key":"1629_CR41","doi-asserted-by":"crossref","unstructured":"Thys, S., Van Ranst, W., & Goedem\u00e9, T. (2019). Fooling automated surveillance cameras: adversarial patches to attack person detection. In Proceedings of the IEEE conference on computer vision and pattern recognition workshops (pp. 0\u20130).","DOI":"10.1109\/CVPRW.2019.00012"},{"key":"1629_CR42","unstructured":"Vadillo, J., & Santana, R. (2019). Universal adversarial examples in speech command classification. arXiv:1911.10182."},{"key":"1629_CR43","unstructured":"Vaidya, T., Zhang, Y., Sherr, M., & Shields, C. (2015). Cocaine noodles: exploiting the gap between human and machine speech recognition. In 9th USENIX workshop on offensive technologies."},{"key":"1629_CR44","doi-asserted-by":"crossref","unstructured":"Variani, E., Lei, X., McDermott, E., Moreno, I.L., & Gonzalez-Dominguez, J. (2014). Deep neural networks for small footprint text-dependent speaker verification. In 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 4052\u20134056): IEEE.","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"1629_CR45","doi-asserted-by":"crossref","unstructured":"Xie, Y., Shi, C., Li, Z., Liu, J., Chen, Y., & Yuan, B. (2020). Real-time, universal, and robust adversarial attacks against speaker recognition systems. In ICASSP 2020-2020 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 1738\u20131742): IEEE.","DOI":"10.1109\/ICASSP40776.2020.9053747"},{"key":"1629_CR46","unstructured":"Yuan, X., Chen, Y., Zhao, Y., Long, Y., Liu, X., Chen, K., Zhang, S., Huang, H., Wang, X., & Gunter, C.A. (2018). Commandersong: a systematic approach for practical adversarial voice recognition. In 27th USENIX security symposium (USENIX security 18) (pp. 49\u201364)."}],"container-title":["Journal of Signal Processing Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11265-020-01629-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11265-020-01629-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11265-020-01629-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,10,19]],"date-time":"2021-10-19T20:27:42Z","timestamp":1634675262000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11265-020-01629-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,10]]},"references-count":46,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2021,10]]}},"alternative-id":["1629"],"URL":"https:\/\/doi.org\/10.1007\/s11265-020-01629-9","relation":{},"ISSN":["1939-8018","1939-8115"],"issn-type":[{"value":"1939-8018","type":"print"},{"value":"1939-8115","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,2,10]]},"assertion":[{"value":"26 June 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 November 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 December 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 February 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}