{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T02:19:45Z","timestamp":1743128385886,"version":"3.40.3"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031779602"},{"type":"electronic","value":"9783031779619"}],"license":[{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-77961-9_8","type":"book-chapter","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T13:53:51Z","timestamp":1732197231000},"page":"104-118","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Pre-training and\u00a0Adverse Audio Samples for\u00a0Data-Efficient Wake Word Detection"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6287-5657","authenticated-orcid":false,"given":"Manuel","family":"Torralbo","sequence":"first","affiliation":[]},{"given":"Ariane","family":"M\u00e9ndez","sequence":"additional","affiliation":[]},{"given":"Maia","family":"Agirre","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3505-5514","authenticated-orcid":false,"given":"Arantza","family":"Del Pozo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"unstructured":"FFmpeg. https:\/\/ffmpeg.org\/. Accessed 10 Jul 2024","key":"8_CR1"},{"doi-asserted-by":"crossref","unstructured":"Piper. https:\/\/github.com\/rhasspy\/piper. Accessed 15 Jul 2024","key":"8_CR2","DOI":"10.4324\/9781003586999-2"},{"unstructured":"webrtcvad. https:\/\/pypi.org\/project\/webrtcvad\/. Accessed 10 Jul 2024","key":"8_CR3"},{"unstructured":"Ardila, R., et al.: Common voice: a massively-multilingual speech corpus. arXiv preprint arXiv:1912.06670 (2019)","key":"8_CR4"},{"unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)","key":"8_CR5"},{"doi-asserted-by":"publisher","unstructured":"Bonet, D., et al.: Speech enhancement for wake-up-word detection in voice assistants. In: Proceedings of the IberSPEECH 2021, pp. 41\u201345 (2021). https:\/\/doi.org\/10.21437\/IberSPEECH.2021-9","key":"8_CR6","DOI":"10.21437\/IberSPEECH.2021-9"},{"doi-asserted-by":"crossref","unstructured":"Chang, S., Park, H., Cho, J., Park, H., Yun, S., Hwang, K.: Subspectral normalization for neural audio data processing. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 850\u2013854. IEEE (2021)","key":"8_CR7","DOI":"10.1109\/ICASSP39728.2021.9413522"},{"doi-asserted-by":"publisher","unstructured":"Chen, G., Parada, C., Heigold, G.: Small-footprint keyword spotting using deep neural networks. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4087\u20134091 (2014). https:\/\/doi.org\/10.1109\/ICASSP.2014.6854370","key":"8_CR8","DOI":"10.1109\/ICASSP.2014.6854370"},{"doi-asserted-by":"crossref","unstructured":"Chen, G., Yilmaz, O., Trmal, J., Povey, D., Khudanpur, S.: Using proxies for OOV keywords in the keyword search task. In: 2013 IEEE Workshop on Automatic Speech Recognition and Understanding, pp. 416\u2013421. IEEE (2013)","key":"8_CR9","DOI":"10.1109\/ASRU.2013.6707766"},{"doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)","key":"8_CR10","DOI":"10.3115\/v1\/D14-1179"},{"unstructured":"De\u00a0Andrade, D.C., Leo, S., Viana, M.L.D.S., Bernkopf, C.: A neural attention model for speech command recognition. arXiv preprint arXiv:1808.08929 (2018)","key":"8_CR11"},{"doi-asserted-by":"crossref","unstructured":"Dean, D., Kanagasundaram, A., Ghaemmaghami, H., Rahman, M.H., Sridharan, S.: The qut-noise-sre protocol for the evaluation of noisy speaker recognition. In: Proceedings of the 16th Annual Conference of the International Speech Communication Association, Interspeech 2015, pp. 3456\u20133460. International Speech Communication Association (2015)","key":"8_CR12","DOI":"10.21437\/Interspeech.2015-685"},{"unstructured":"DeVries, T., Taylor, G.W.: Improved regularization of convolutional neural networks with cutout. arXiv preprint arXiv:1708.04552 (2017)","key":"8_CR13"},{"unstructured":"Gong, Y., Liu, L., Yang, M., Bourdev, L.: Compressing deep convolutional networks using vector quantization. arXiv preprint arXiv:1412.6115 (2014)","key":"8_CR14"},{"doi-asserted-by":"crossref","unstructured":"Hossain, D., Sato, Y.: Efficient corpus design for wake-word detection. In: 2021 IEEE Spoken Language Technology Workshop (SLT), pp. 1094\u20131100. IEEE (2021)","key":"8_CR15","DOI":"10.1109\/SLT48900.2021.9383569"},{"doi-asserted-by":"crossref","unstructured":"Jacob, B., et al.: Quantization and training of neural networks for efficient integer-arithmetic-only inference. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2704\u20132713 (2018)","key":"8_CR16","DOI":"10.1109\/CVPR.2018.00286"},{"doi-asserted-by":"crossref","unstructured":"Kim, B., Chang, S., Lee, J., Sung, D.: Broadcasted residual learning for efficient keyword spotting. arXiv preprint arXiv:2106.04140 (2021)","key":"8_CR17","DOI":"10.21437\/Interspeech.2021-383"},{"unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)","key":"8_CR18"},{"doi-asserted-by":"crossref","unstructured":"Kriman, S., et al.: QuartzNet: deep automatic speech recognition with 1d time-channel separable convolutions. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6124\u20136128. IEEE (2020)","key":"8_CR19","DOI":"10.1109\/ICASSP40776.2020.9053889"},{"doi-asserted-by":"crossref","unstructured":"Lau, J., Zimmerman, B., Schaub, F.: Alexa, are you listening? privacy perceptions, concerns and privacy-seeking behaviors with smart speakers. Proc. ACM Hum.-Comput. Interact. 2(CSCW), 1\u201331 (2018)","key":"8_CR20","DOI":"10.1145\/3274371"},{"key":"8_CR21","doi-asserted-by":"publisher","first-page":"4169","DOI":"10.1109\/ACCESS.2021.3139508","volume":"10","author":"I L\u00f3pez-Espejo","year":"2021","unstructured":"L\u00f3pez-Espejo, I., Tan, Z.H., Hansen, J.H., Jensen, J.: Deep spoken keyword spotting: an overview. IEEE Access 10, 4169\u20134199 (2021)","journal-title":"IEEE Access"},{"unstructured":"Loshchilov, I., Hutter, F.: SGDR: stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)","key":"8_CR22"},{"unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)","key":"8_CR23"},{"doi-asserted-by":"crossref","unstructured":"Majumdar, S., Ginsburg, B.: Matchboxnet: 1d time-channel separable convolutional neural network architecture for speech commands recognition. arXiv preprint arXiv:2004.08531 (2020)","key":"8_CR24","DOI":"10.21437\/Interspeech.2020-1058"},{"doi-asserted-by":"crossref","unstructured":"Miller, D.R., et al.: Rapid and accurate spoken term detection. In: Interspeech. vol.\u00a07, pp. 314\u2013317 (2007)","key":"8_CR25","DOI":"10.21437\/Interspeech.2007-174"},{"doi-asserted-by":"crossref","unstructured":"Panchapagesan, S., et al.: Multi-task learning and weighted cross-entropy for DNN-based keyword spotting (2016)","key":"8_CR26","DOI":"10.21437\/Interspeech.2016-1485"},{"doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: Specaugment: a simple data augmentation method for automatic speech recognition. arXiv preprint arXiv:1904.08779 (2019)","key":"8_CR27","DOI":"10.21437\/Interspeech.2019-2680"},{"unstructured":"Rose, R.C., Paul, D.B.: A hidden Markov model based keyword recognition system. In: International Conference on Acoustics, Speech, and Signal Processing, pp. 129\u2013132. IEEE (1990)","key":"8_CR28"},{"doi-asserted-by":"crossref","unstructured":"Rybakov, O., Kononenko, N., Subrahmanya, N., Visontai, M., Laurenzo, S.: Streaming keyword spotting on mobile devices. arXiv preprint arXiv:2005.06720 (2020)","key":"8_CR29","DOI":"10.21437\/Interspeech.2020-1003"},{"doi-asserted-by":"crossref","unstructured":"Sainath, T.N., Parada, C.: Convolutional neural networks for small-footprint keyword spotting. In: Interspeech, pp. 1478\u20131482 (2015)","key":"8_CR30","DOI":"10.21437\/Interspeech.2015-352"},{"doi-asserted-by":"crossref","unstructured":"Shan, C., Zhang, J., Wang, Y., Xie, L.: Attention-based end-to-end models for small-footprint keyword spotting. arXiv preprint arXiv:1803.10916 (2018)","key":"8_CR31","DOI":"10.21437\/Interspeech.2018-1777"},{"unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)","key":"8_CR32"},{"unstructured":"Warden, P.: Speech commands: a dataset for limited-vocabulary speech recognition. arXiv preprint arXiv:1804.03209 (2018)","key":"8_CR33"},{"doi-asserted-by":"crossref","unstructured":"Wilpon, J.G., Miller, L.G., Modi, P.: Improvements and applications for key word recognition using hidden markov modeling techniques. In: [Proceedings] ICASSP 91: 1991 International Conference on Acoustics, Speech, and Signal Processing, pp. 309\u2013312. IEEE (1991)","key":"8_CR34","DOI":"10.1109\/ICASSP.1991.150338"},{"doi-asserted-by":"crossref","unstructured":"Wu, M., et al.: Monophone-based background modeling for two-stage on-device wake word detection. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5494\u20135498. IEEE (2018)","key":"8_CR35","DOI":"10.1109\/ICASSP.2018.8462227"},{"doi-asserted-by":"crossref","unstructured":"Xi, Y., Yang, B., Li, H., Guo, J., Yu, K.: Contrastive learning with audio discrimination for customizable keyword spotting in continuous speech. In: ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 11666\u201311670. IEEE (2024)","key":"8_CR36","DOI":"10.1109\/ICASSP48485.2024.10447881"},{"unstructured":"Zhang, Y., Suda, N., Lai, L., Chandra, V.: Hello edge: keyword spotting on microcontrollers. arXiv preprint arXiv:1711.07128 (2017)","key":"8_CR37"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-77961-9_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,9]],"date-time":"2025-01-09T16:04:33Z","timestamp":1736438673000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-77961-9_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"ISBN":["9783031779602","9783031779619"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-77961-9_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,22]]},"assertion":[{"value":"22 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Belgrade","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Serbia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom2024.ftn.uns.ac.rs\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}