{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,27]],"date-time":"2025-07-27T07:52:20Z","timestamp":1753602740628,"version":"3.37.3"},"reference-count":67,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/taslp.2023.3306714","type":"journal-article","created":{"date-parts":[[2023,8,18]],"date-time":"2023-08-18T17:25:23Z","timestamp":1692379523000},"page":"3848-3858","source":"Crossref","is-referenced-by-count":5,"title":["Timbre-Reserved Adversarial Attack in Speaker Identification"],"prefix":"10.1109","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5449-4815","authenticated-orcid":false,"given":"Qing","family":"Wang","sequence":"first","affiliation":[{"name":"Audio, Speech and Langauge Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5324-7360","authenticated-orcid":false,"given":"Jixun","family":"Yao","sequence":"additional","affiliation":[{"name":"Audio, Speech and Langauge Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5077-9962","authenticated-orcid":false,"given":"Li","family":"Zhang","sequence":"additional","affiliation":[{"name":"Audio, Speech and Langauge Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2388-5935","authenticated-orcid":false,"given":"Pengcheng","family":"Guo","sequence":"additional","affiliation":[{"name":"Audio, Speech and Langauge Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8234-0823","authenticated-orcid":false,"given":"Lei","family":"Xie","sequence":"additional","affiliation":[{"name":"Audio, Speech and Langauge Processing Group (ASLP@NPU), School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/SPW.2018.00009"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-277"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1201\/9781351251389-8"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1052"},{"article-title":"Did you hear that? Adversarial examples against automatic speech recognition","year":"2018","author":"alzantot","key":"ref15"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23288"},{"article-title":"AISHELL-3: A multi-speaker Mandarin TTS corpus and the baselines","year":"2020","author":"shi","key":"ref58"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref52","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","author":"kim","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref11","article-title":"Intriguing properties of neural networks","author":"szegedy","year":"0","journal-title":"Proc 2nd Int Conf Learn Representations"},{"key":"ref55","first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","author":"kong","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/SP.2017.49"},{"key":"ref54","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","author":"ren","year":"0","journal-title":"Proc 9th Int Conf Learn Representations"},{"key":"ref17","first-page":"5231","article-title":"Imperceptible, robust, and targeted adversarial examples for automatic speech recognition","author":"qin","year":"0","journal-title":"Proc 36th Int Conf Mach Learn"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2933146"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/SP40001.2021.00014"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747020"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413699"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/VCC_BC.2020-17"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413973"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747558"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3038524"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3460120.3484742"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552917"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178896"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2249"},{"key":"ref7","article-title":"The ASVspoof 2017 challenge: Assessing the limits of replay spoofing attack detection","author":"kinnunen","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref9","article-title":"Explaining and harnessing adversarial examples","author":"goodfellow","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2014.10.005"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2012-465"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-462"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2671435"},{"key":"ref40","first-page":"125","article-title":"WaveNet: A generative model for raw audio","author":"oord","year":"0","journal-title":"Proc 9th ISCA Speech Synth Workshop"},{"key":"ref35","first-page":"3171","article-title":"FastSpeech: Fast, robust and controllable text to speech","author":"ren","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref34","first-page":"4006","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-11581-8_59"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP.2010.5684887"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2021.101199"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/SP40001.2021.00004"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746939"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s40747-022-00782-x"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/89.365379"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2015.2462851"},{"key":"ref39","article-title":"SampleRNN: An unconditional end-to-end neural audio generation model","author":"mehri","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"article-title":"Attacking speaker recognition with deep generative models","year":"2018","author":"cai","key":"ref38"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2019.23362"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2983"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1032"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICME46284.2020.9102886"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1955"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747766"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/ASVSPOOF.2021-1"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462693"},{"key":"ref66","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"maaten","year":"2008","journal-title":"J Mach Learn Res"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2003"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3376897.3377856"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053747"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053076"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1983"},{"article-title":"ASVspoof 2021: Automatic speaker verification spoofing and countermeasures challenge evaluation plan","year":"2021","author":"delgado","key":"ref62"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9970249\/10224300.pdf?arnumber=10224300","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,13]],"date-time":"2023-11-13T19:33:47Z","timestamp":1699904027000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10224300\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":67,"URL":"https:\/\/doi.org\/10.1109\/taslp.2023.3306714","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"type":"print","value":"2329-9290"},{"type":"electronic","value":"2329-9304"}],"subject":[],"published":{"date-parts":[[2023]]}}}