{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T19:57:37Z","timestamp":1777579057922,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,11,12]],"date-time":"2021-11-12T00:00:00Z","timestamp":1636675200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001659","name":"Deutsche Forschungsgemeinschaft","doi-asserted-by":"publisher","award":["CRC 1119"],"award-info":[{"award-number":["CRC 1119"]}],"id":[{"id":"10.13039\/501100001659","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61925109, 61941120"],"award-info":[{"award-number":["61925109, 61941120"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,11,12]]},"DOI":"10.1145\/3460120.3485365","type":"proceedings-article","created":{"date-parts":[[2021,11,13]],"date-time":"2021-11-13T12:05:33Z","timestamp":1636805133000},"page":"1861-1883","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["FakeWake: Understanding and Mitigating Fake Wake-up Words of Voice Assistants"],"prefix":"10.1145","author":[{"given":"Yanjiao","family":"Chen","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Yijie","family":"Bai","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Richard","family":"Mitev","sequence":"additional","affiliation":[{"name":"Technical University of Darmstadt, Darmstadt, Germany"}]},{"given":"Kaibo","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"given":"Ahmad-Reza","family":"Sadeghi","sequence":"additional","affiliation":[{"name":"Technical University of Darmstadt, Darmstadt, Germany"}]},{"given":"Wenyuan","family":"Xu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2021,11,13]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Alexa Dataset: Build voice-first applications. https:\/\/www.kaggle.com\/aanhari\/alexa-dataset","author":"Anhari Amir","year":"2021","unstructured":"Amir Anhari. 2021. Alexa Dataset: Build voice-first applications. https:\/\/www.kaggle.com\/aanhari\/alexa-dataset"},{"key":"e_1_3_2_2_2_1","volume-title":"Convolutional recurrent neural networks for small-footprint keyword spotting. arXiv preprint arXiv:1703.05390","author":"Arik Sercan O","year":"2017","unstructured":"Sercan O Arik, Markus Kliegl, Rewon Child, Joel Hestness, Andrew Gibiansky, Chris Fougner, Ryan Prenger, and Adam Coates. 2017. Convolutional recurrent neural networks for small-footprint keyword spotting. arXiv preprint arXiv:1703.05390 (2017)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078631"},{"key":"e_1_3_2_2_4_1","volume-title":"Strategy Analytics: Global Smart Speaker Sales Cross 150 Million Units for 2020 Following Robust Q4 Demand. https:\/\/smallurl.net\/businesswire","year":"2020","unstructured":"BusinessWire. 2020. Strategy Analytics: Global Smart Speaker Sales Cross 150 Million Units for 2020 Following Robust Q4 Demand. https:\/\/smallurl.net\/businesswire"},{"key":"e_1_3_2_2_5_1","unstructured":"Canalys. 2020. Global smart speaker market 2021 forecast. https:\/\/www.canalys.com\/newsroom\/canalys-global-smart-speaker-market-2021-forecast"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.5555\/3241094.3241135"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/SPW.2018.00009"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854370"},{"key":"e_1_3_2_2_9_1","volume-title":"Temporal convolution for real-time keyword spotting on mobile devices. arXiv preprint arXiv:1904.03814","author":"Choi Seungwoo","year":"2019","unstructured":"Seungwoo Choi, Seokjun Seo, Beomjun Shin, Hyeongmin Byun, Martin Kersner, Beomsu Kim, Dongyoung Kim, and Sungjoo Ha. 2019. Temporal convolution for real-time keyword spotting on mobile devices. arXiv preprint arXiv:1904.03814 (2019)."},{"key":"e_1_3_2_2_10_1","unstructured":"CNET. 2018. Alexa sent private audio to a random contact Portland family says. https:\/\/www.cnet.com\/home\/smart-home\/alexa-sent-private-audio-to-a-random-contact-portland-family-says\/"},{"key":"e_1_3_2_2_11_1","unstructured":"Toby Cox. 2020. Siri and Alexa Fails: Frustrations With Voice Search. https:\/\/themanifest.com\/digital-marketing\/resources\/siri-alexa-fails-frustrations-with-voice-search"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/4235.996017"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.2478\/popets-2020-0072"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-74695-9_23"},{"key":"e_1_3_2_2_15_1","unstructured":"Center for Hearing and Communication. 2020. Common environmental noise levels. https:\/\/chchearing.org\/noise\/common-environmental-noise-levels\/"},{"key":"e_1_3_2_2_16_1","volume-title":"Greedy Function Approximation: A Gradient Boosting Machine. Annals of Statistics","author":"Friedman Jerome H.","year":"2000","unstructured":"Jerome H. Friedman. 2000. Greedy Function Approximation: A Gradient Boosting Machine. Annals of Statistics (2000)."},{"key":"e_1_3_2_2_17_1","unstructured":"FutureLearn. 2021. Introduction to Pinyin. https:\/\/www.futurelearn.com\/info\/courses\/chinese-pronunciation-tone\/0\/steps\/64892."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1660179"},{"key":"e_1_3_2_2_19_1","unstructured":"Kris Gesling. 2021. Precise. https:\/\/mycroft-ai.gitbook.io\/docs\/mycroft-technologies\/precise."},{"key":"e_1_3_2_2_20_1","volume-title":"Ranked: The 100 Most Spoken Languages Around the World. https:\/\/www.visualcapitalist.com\/100-most-spoken-languages\/","author":"Ghosh Iman","year":"2020","unstructured":"Iman Ghosh. 2020. Ranked: The 100 Most Spoken Languages Around the World. https:\/\/www.visualcapitalist.com\/100-most-spoken-languages\/"},{"key":"e_1_3_2_2_21_1","unstructured":"Google. 2018. Google Speech. https:\/\/pypi.org\/project\/google-speech\/"},{"key":"e_1_3_2_2_22_1","first-page":"3","article-title":"Some distance properties of latent root and vector methods used in multivariate analysis","volume":"53","author":"J. C.","year":"1966","unstructured":"J. C. GOWER. 1966. Some distance properties of latent root and vector methods used in multivariate analysis. Biometrika, Vol. 53, 3--4 (1966), 325--338.","journal-title":"Biometrika"},{"key":"e_1_3_2_2_23_1","unstructured":"HealthLinkBC. 2020. Harmful Noise Levels. https:\/\/www.healthlinkbc.ca\/health-topics\/tf4173"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2019.2936282"},{"key":"e_1_3_2_2_25_1","volume-title":"IEEE International Joint Conference on Neural Networks","volume":"4","author":"Kalman B. L.","unstructured":"B. L. Kalman and S. C. Kwasny. 1992. Why tanh: choosing a sigmoidal function. In IEEE International Joint Conference on Neural Networks, Vol. 4. 578--581."},{"key":"e_1_3_2_2_26_1","volume-title":"DIMSIM: An Accurate Chinese Phonetic Similarity Algorithm Based on Learned High Dimensional Encoding. In 22nd Conference on Computational Natural Language Learning.","author":"Li Min","year":"2018","unstructured":"Min Li, Marina Danilevsky, Sara Noeman, and Yunyao Li. 2018. DIMSIM: An Accurate Chinese Phonetic Similarity Algorithm Based on Learned High Dimensional Encoding. In 22nd Conference on Computational Natural Language Learning."},{"key":"e_1_3_2_2_27_1","volume-title":"From local explanations to global understanding with explainable AI for trees. Nature machine intelligence","author":"Lundberg Scott M.","year":"2020","unstructured":"Scott M. Lundberg, Gabriel Erion, Hugh Chen, Alex DeGrave, Jordan M. Prutkin, Bala Nair, Ronit Katz, Jonathan Himmelfarb, Nisha Bansal, and Su-In Lee. 2020. From local explanations to global understanding with explainable AI for trees. Nature machine intelligence, Vol. 2, 1 (2020), 56--67."},{"key":"e_1_3_2_2_28_1","unstructured":"Markets and Markets. 2020. Smart Speaker Market with COVID-19 Impact Analysis by IVA (Alexa Google Assistant Siri DuerOS Ali Genie) Component (Hardware (Speaker Driver Connectivity IC Processor Audio IC Memory Power IC Microphone) and Software) Application and Region - Global Forecast to 2025. https:\/\/www.marketsandmarkets.com\/Market-Reports\/smart-speaker-market-44984088.html"},{"key":"e_1_3_2_2_29_1","volume-title":"LeakyPick: IoT Audio Spy Detector. In Annual Computer Security Applications Conference.","author":"Mitev Richard","year":"2020","unstructured":"Richard Mitev, Anna Pazii, Markus Miettinen, William Enck, and Ahmad-Reza Sadeghi. 2020. LeakyPick: IoT Audio Spy Detector. In Annual Computer Security Applications Conference."},{"key":"e_1_3_2_2_30_1","volume-title":"26th International Conference on Computational Linguistics: Technical Papers.","author":"Mortensen David R.","year":"2016","unstructured":"David R. Mortensen, Patrick Littell, Akash Bharadwaj, Kartik Goyal, Chris Dyer, and Lori Levin. 2016. PanPhon: A Resource for Mapping IPA Segments to Articulatory Feature Vectors. In 26th International Conference on Computational Linguistics: Technical Papers."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/375360.375365"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Sankaran Panchapagesan Ming Sun Aparna Khare Spyros Matsoukas Arindam Mandal Bj\u00f6rn Hoffmeister and Shiv Vitaladevuni. 2016. Multi-task learning and weighted cross-entropy for DNN-based keyword spotting. In Interspeech.","DOI":"10.21437\/Interspeech.2016-1485"},{"key":"e_1_3_2_2_33_1","unstructured":"Kyubyong Park and Jongseok Kim. 2019. g2pE: A Simple Python Module for English Grapheme To Phoneme Conversion. https:\/\/github.com\/Kyubyong\/g2p."},{"key":"e_1_3_2_2_34_1","unstructured":"Sarah Perez. 2019. China overtakes US in smart speaker market share. shorturl.at\/bBGOW"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054423"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-352"},{"key":"e_1_3_2_2_37_1","volume-title":"where is my privacy? Exploring Accidental Triggers of Smart Speakers. arXiv preprint arXiv:2008.00508","author":"Sch\u00f6nherr Lea","year":"2020","unstructured":"Lea Sch\u00f6nherr, Maximilian Golla, Thorsten Eisenhofer, Jan Wiele, Dorothea Kolossa, and Thorsten Holz. 2020. Unacceptable, where is my privacy? Exploring Accidental Triggers of Smart Speakers. arXiv preprint arXiv:2008.00508 (2020)."},{"key":"e_1_3_2_2_38_1","volume-title":"Adversarial Attacks Against Automatic Speech Recognition Systems via Psychoacoustic Hiding. arXiv preprint arXiv:1808.05665","author":"Sch\u00f6nherr Lea","year":"2018","unstructured":"Lea Sch\u00f6nherr, Katharina Kohls, Steffen Zeiler, Thorsten Holz, and Dorothea Kolossa. 2018. Adversarial Attacks Against Automatic Speech Recognition Systems via Psychoacoustic Hiding. arXiv preprint arXiv:1808.05665 (2018)."},{"key":"e_1_3_2_2_39_1","unstructured":"Eric Hal Schwartz. 2020. Voice Assistants Very Prone to Accidentally Waking Up and Recording Long Audio Clips: Study. shorturl.at\/bdCEY"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"crossref","unstructured":"Ming Sun David Snyder Yixin Gao Varun K Nagaraja Mike Rodehorst Sankaran Panchapagesan Nikko Strom Spyros Matsoukas and Shiv Vitaladevuni. 2017. Compressed Time Delay Neural Network for Small-Footprint Keyword Spotting.. In Interspeech.","DOI":"10.21437\/Interspeech.2017-480"},{"key":"e_1_3_2_2_41_1","volume-title":"International Conference on Machine Learning. PMLR, 3319--3328","author":"Sundararajan Mukund","year":"2017","unstructured":"Mukund Sundararajan, Ankur Taly, and Qiqi Yan. 2017. Axiomatic attribution for deep networks. In International Conference on Machine Learning. PMLR, 3319--3328."},{"key":"e_1_3_2_2_42_1","volume-title":"9th USENIX Workshop on Offensive Technologies.","author":"Vaidya Tavish","year":"2015","unstructured":"Tavish Vaidya, Yuankai Zhang, Micah Sherr, and Clay Shields. 2015. Cocaine noodles: exploiting the gap between human and machine speech recognition. In 9th USENIX Workshop on Offensive Technologies."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2012.08.006"},{"key":"e_1_3_2_2_44_1","unstructured":"Yellowbridge. 2021. Learn Chinese Pinyin Rules: Initials Finals and Tones. https:\/\/www.yellowbridge.com\/chinese\/pinyin-rules.php."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2020.24178"},{"key":"e_1_3_2_2_46_1","volume-title":"27th USENIX Security Symposium.","author":"Yuan Xuejing","year":"2018","unstructured":"Xuejing Yuan, Yuxuan Chen, Yue Zhao, Yunhui Long, Xiaokang Liu, Kai Chen, Shengzhi Zhang, Heqing Huang, Xiaofeng Wang, and Carl A Gunter. 2018. Commandersong: A systematic approach for practical adversarial voice recognition. In 27th USENIX Security Symposium."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133956.3134052"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2009.5372931"}],"event":{"name":"CCS '21: 2021 ACM SIGSAC Conference on Computer and Communications Security","location":"Virtual Event Republic of Korea","acronym":"CCS '21","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"]},"container-title":["Proceedings of the 2021 ACM SIGSAC Conference on Computer and Communications Security"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3460120.3485365","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3460120.3485365","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T20:48:33Z","timestamp":1763498913000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3460120.3485365"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,12]]},"references-count":48,"alternative-id":["10.1145\/3460120.3485365","10.1145\/3460120"],"URL":"https:\/\/doi.org\/10.1145\/3460120.3485365","relation":{},"subject":[],"published":{"date-parts":[[2021,11,12]]},"assertion":[{"value":"2021-11-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}