{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:17:39Z","timestamp":1740100659649,"version":"3.37.3"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100003803","name":"University of Hong Kong","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003803","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,5,23]]},"DOI":"10.1109\/icassp43922.2022.9747242","type":"proceedings-article","created":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T19:50:34Z","timestamp":1651089034000},"page":"3164-3168","source":"Crossref","is-referenced-by-count":3,"title":["Characterizing the Adversarial Vulnerability of Speech self-Supervised Learning"],"prefix":"10.1109","author":[{"given":"Haibin","family":"Wu","sequence":"first","affiliation":[{"name":"National Taiwan University,Graduate Institute of Communication Engineering"}]},{"given":"Bo","family":"Zheng","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Centre for Perceptual and Interactive Intelligence"}]},{"given":"Xu","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Human-Computer Communications Laboratory"}]},{"given":"Xixin","family":"Wu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Centre for Perceptual and Interactive Intelligence"}]},{"given":"Hung-Yi","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University,Graduate Institute of Communication Engineering"}]},{"given":"Helen","family":"Meng","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Centre for Perceptual and Interactive Intelligence"}]}],"member":"263","reference":[{"article-title":"Adversarial examples in the physical world","year":"2016","author":"kurakin","key":"ref38"},{"key":"ref33","doi-asserted-by":"crossref","first-page":"44","DOI":"10.1007\/978-3-030-31372-2_4","article-title":"Recent advances in end-to-end spoken language understanding","author":"tomashenko","year":"2019","journal-title":"Lecture Notes in Computer Science"},{"article-title":"Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces","year":"2018","author":"coucke","key":"ref32"},{"article-title":"Semi-supervised spoken language understanding via self-supervised speech and language model pretraining","year":"2020","author":"lai","key":"ref31"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2396"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2899"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00552"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","article-title":"Iemocap: interactive emotional dyadic motion capture database","volume":"42","author":"busso","year":"2008","journal-title":"Language Resources and Evaluation"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462693"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2430"},{"key":"ref12","first-page":"202","article-title":"Improving the adversarial robustness for speaker verification by self-supervised learning","volume":"30","author":"wu","year":"2021","journal-title":"IEEE\/ACM TASLP"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053076"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1452"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413737"},{"article-title":"Spotting adversarial samples for speaker verifi-cation by neural vocoders","year":"2021","author":"wu","key":"ref16"},{"article-title":"Practical attacks on voice spoofing countermeasures","year":"2021","author":"kassis","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003763"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053643"},{"article-title":"Librimix: An open-source dataset for generalizable speech separation","year":"2020","author":"cosentino","key":"ref28"},{"article-title":"Intriguing properties of neural networks","year":"2013","author":"szegedy","key":"ref4"},{"key":"ref27","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2017-950","article-title":"Voxceleb: a large-scale speaker identification dataset","author":"nagrani","year":"2017"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"article-title":"Robust audio adversarial example for a physical attack","year":"2018","author":"yakura","key":"ref6"},{"key":"ref29","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"ASRU"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/SPW.2018.00009"},{"key":"ref8","first-page":"5231","article-title":"Imperceptible, robust, and targeted adversarial examples for automatic speech recognition","author":"qin","year":"2019","journal-title":"International Conference on Machine Learning"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/SPW.2019.00016"},{"key":"ref2","first-page":"7345","article-title":"How useful is self-supervised pre-training for visual tasks?","author":"newell","year":"2020","journal-title":"IEEE Proc CVPR"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2458"},{"article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"devlin","key":"ref1"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2026"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383529"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2834"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414460"},{"article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","year":"2020","author":"baevski","key":"ref23"},{"article-title":"Speech commands: A dataset for limited-vocabulary speech recognition","year":"2018","author":"warden","key":"ref26"},{"key":"ref25","first-page":"5206","article-title":"Lib-rispeech: An asr corpus based on public domain audio books","author":"panayotov","year":"2015","journal-title":"ICASSP 2015"}],"event":{"name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2022,5,23]]},"location":"Singapore, Singapore","end":{"date-parts":[[2022,5,27]]}},"container-title":["ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9745891\/9746004\/09747242.pdf?arnumber=9747242","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,15]],"date-time":"2022-08-15T20:05:24Z","timestamp":1660593924000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9747242\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,23]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/icassp43922.2022.9747242","relation":{},"subject":[],"published":{"date-parts":[[2022,5,23]]}}}