{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T17:17:58Z","timestamp":1770830278739,"version":"3.50.1"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031801358","type":"print"},{"value":"9783031801365","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-80136-5_12","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T09:38:13Z","timestamp":1733045893000},"page":"170-184","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Audio-Visual Wake-up Word Spotting Under Noisy and\u00a0Multi-person Scenarios"],"prefix":"10.1007","author":[{"given":"Cancan","family":"Li","sequence":"first","affiliation":[]},{"given":"Fei","family":"Su","sequence":"additional","affiliation":[]},{"given":"Juan","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,1]]},"reference":[{"key":"12_CR1","doi-asserted-by":"publisher","first-page":"4169","DOI":"10.1109\/ACCESS.2021.3139508","volume":"10","author":"I L\u00f3pez-Espejo","year":"2021","unstructured":"L\u00f3pez-Espejo, I., Tan, Z.H., Hansen, J.H., Jensen, J.: Deep spoken keyword spotting: an overview. IEEE Access 10, 4169\u20134199 (2021)","journal-title":"IEEE Access"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: Audio-visual wake word spotting system for MISP challenge 2021. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 9246\u20139250. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746762"},{"key":"12_CR3","doi-asserted-by":"crossref","unstructured":"Zhang, A., et al.: VE-KWS: visual modality enhanced end-to-end keyword spotting. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096858"},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Wang, H., Cheng, M., Fu, Q., Li, M.: Robust wake word spotting with frame-level cross-modal attention based audio-visual conformer. In: ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 11556\u201311560. IEEE (2024)","DOI":"10.1109\/ICASSP48485.2024.10446074"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, M., Wang, H., Wang, Y., Li, M.: The DKU audio-visual wake word spotting system for the 2021 MISP challenge. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 9256\u20139260. 
IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747216"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Wang, H., Cheng, M., Fu, Q., Li, M.: The DKU post-challenge audio-visual wake word spotting system for the 2021 MISP challenge: Deep analysis. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095459"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Chen, H., et\u00a0al.: The first multimodal information based speech processing (MISP) challenge: data, tasks, baselines and results. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 9266\u20139270. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746683"},{"issue":"12","key":"12_CR8","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Ma, P., Petridis, S., Pantic, M.: End-to-end audio-visual speech recognition with conformers. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7613\u20137617. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"12_CR10","unstructured":"Shi, B., Hsu, W.N., Lakhotia, K., Mohamed, A.: Learning audio-visual speech representation by masked multimodal cluster prediction. arXiv preprint arXiv:2201.02184 (2022)"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: VisualVoice: audio-visual speech separation with cross-modal consistency. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15490\u201315500. IEEE (2021)","DOI":"10.1109\/CVPR46437.2021.01524"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:1804.03619 (2018)","DOI":"10.1145\/3197517.3201357"},{"key":"12_CR13","doi-asserted-by":"publisher","first-page":"1079","DOI":"10.1109\/TASLP.2021.3057230","volume":"29","author":"Y Qian","year":"2021","unstructured":"Qian, Y., Chen, Z., Wang, S.: Audio-visual deep neural network for robust person verification. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 1079\u20131092 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Wuerkaixi, A., Zhang, Y., Duan, Z., Zhang, C.: Rethinking audio-visual synchronization for active speaker detection. In: 2022 IEEE 32nd International Workshop on Machine Learning for Signal Processing (MLSP), pp. 01\u201306. IEEE (2022)","DOI":"10.1109\/MLSP55214.2022.9943352"},{"key":"12_CR15","doi-asserted-by":"crossref","unstructured":"Kim, Y.J., et al.: Look who\u2019s talking: active speaker detection in the wild. 
arXiv preprint arXiv:2108.07640 (2021)","DOI":"10.21437\/Interspeech.2021-2041"},{"key":"12_CR16","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.121648","volume":"238","author":"J Peymanfard","year":"2024","unstructured":"Peymanfard, J., Heydarian, S., Lashini, A., Zeinali, H., Mohammadi, M.R., Mozayani, N.: A multi-purpose audio-visual corpus for multi-modal persian speech recognition: the Arman-AV dataset. Expert Syst. Appl. 238, 121648 (2024)","journal-title":"Expert Syst. Appl."},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: deep speaker recognition. arXiv preprint arXiv:1806.05622 (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"12_CR18","unstructured":"Tao, R., Qian, X., Das, R.K., Gao, X., Wang, J., Li, H.: Enhancing real-world active speaker detection with multi-modal extraction pre-training. arXiv preprint arXiv:2404.00861 (2024)"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"Braga, O., Makino, T., Siohan, O., Liao, H.: End-to-end multi-person audio\/visual automatic speech recognition. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6994\u20136998. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053974"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Braga, O., Siohan, O.: A closer look at audio-visual multi-person speech recognition and active speaker selection. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). pp. 6863\u20136867. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414160"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Braga, O., Siohan, O.: Best of both worlds: multi-task audio-visual automatic speech recognition and active speaker detection. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6047\u20136051. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746036"},{"key":"12_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Chen, C.-S., Lu, J., Ma, K.-K. (eds.) ACCV 2016. LNCS, vol. 10117, pp. 251\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Chung, S.W., Chung, J.S., Kang, H.G.: Perfect match: improved cross-modal embeddings for audio-visual synchronisation. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 3965\u20133969. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682524"},{"key":"12_CR24","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Audio-visual synchronisation in the wild. arXiv preprint arXiv:2112.04432 (2021)"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Kadandale, V.S., Montesinos, J.F., Haro, G.: VocaLIST: an audio-visual synchronisation model for lips and voices. arXiv preprint arXiv:2204.02090 (2022)","DOI":"10.21437\/Interspeech.2022-10861"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Zhou, et\u00a0al.: Audio-visual wake word spotting in misp2021 challenge: dataset release and deep analysis. In: Interspeech, pp. 
1111\u20131115 (2022)","DOI":"10.21437\/Interspeech.2022-10650"},{"key":"12_CR27","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Zhou, Y., Yu, J., Kotsia, I., Zafeiriou, S.: RetinaFace: single-stage dense face localisation in the wild. arXiv preprint arXiv:1905.00641 (2019)","DOI":"10.1109\/CVPR42600.2020.00525"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-80136-5_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T10:03:43Z","timestamp":1733047423000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-80136-5_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,1]]},"ISBN":["9783031801358","9783031801365"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-80136-5_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,1]]},"assertion":[{"value":"1 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}