{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:25Z","timestamp":1765343065610,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23A20315"],"award-info":[{"award-number":["U23A20315"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFB4500600"],"award-info":[{"award-number":["2022YFB4500600"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758268","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"13148-13155","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MISP-QEKS: A Large-Scale Dataset with Multimodal Cues for Query-by-Example Keyword Spotting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4759-147X","authenticated-orcid":false,"given":"Shifu","family":"Xiong","sequence":"first","affiliation":[{"name":"NERC-SLIP, USTC, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0904-8946","authenticated-orcid":false,"given":"Hang","family":"Chen","sequence":"additional","affiliation":[{"name":"NERC-SLIP, USTC, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1499-0941","authenticated-orcid":false,"given":"Shi","family":"Cheng","sequence":"additional","affiliation":[{"name":"iFLYTEK, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5570-4876","authenticated-orcid":false,"given":"Kai","family":"Shen","sequence":"additional","affiliation":[{"name":"NERC-SLIP, USTC, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7878-6531","authenticated-orcid":false,"given":"Hengshun","family":"Zhou","sequence":"additional","affiliation":[{"name":"iFLYTEK, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5813-9430","authenticated-orcid":false,"given":"Genshun","family":"Wan","sequence":"additional","affiliation":[{"name":"NERC-SLIP, USTC, hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7827-7662","authenticated-orcid":false,"given":"Chenyue","family":"Zhang","sequence":"additional","affiliation":[{"name":"iFLYTEK, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4315-2874","authenticated-orcid":false,"given":"Kewei","family":"Li","sequence":"additional","affiliation":[{"name":"NERC-SLIP, USTC, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2387-0389","authenticated-orcid":false,"given":"Jun","family":"Du","sequence":"additional","affiliation":[{"name":"NERC-SLIP, USTC, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0859-2827","authenticated-orcid":false,"given":"Lirong","family":"Dai","sequence":"additional","affiliation":[{"name":"NERC-SLIP, USTC, Hefei, Anhui, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889052"},{"key":"e_1_3_2_1_2_1","volume-title":"End-to-end Streaming Keyword Spotting. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 6336-6340","author":"Alvarez Raziel","year":"2019","unstructured":"Raziel Alvarez and Hyun-Jin Park. 2019. End-to-end Streaming Keyword Spotting. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 6336-6340."},{"key":"e_1_3_2_1_3_1","first-page":"3661","article-title":"Robust Keyword Spotting via Recycle-Pooling for Mobile Game","author":"An Shounan","year":"2019","unstructured":"Shounan An, Youngsoo Kim, Hu Xu, Jinwoo Lee, Myungwoo Lee, and Insoo Oh. 2019. Robust Keyword Spotting via Recycle-Pooling for Mobile Game. In INTERSPEECH. 3661-3662.","journal-title":"INTERSPEECH."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746683"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747216"},{"key":"e_1_3_2_1_6_1","volume-title":"Boosting keyword spotting through on-device learnable user speech characteristics. arXiv preprint arXiv:2403.07802","author":"Cioflan Cristian","year":"2024","unstructured":"Cristian Cioflan, Lukas Cavigelli, and Luca Benini. 2024. Boosting keyword spotting through on-device learnable user speech characteristics. arXiv preprint arXiv:2403.07802 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Efficient Keyword Spotting Using Dilated Convolutions and Gating. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 6351-6355","author":"Coucke Alice","year":"2019","unstructured":"Alice Coucke, Mohammed Chlieh, Thibault Gisselbrecht, David Leroy, Mathieu Poumeyrol, and Thibaut Lavril. 2019. Efficient Keyword Spotting Using Dilated Convolutions and Gating. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 6351-6355."},{"key":"e_1_3_2_1_8_1","volume-title":"CAB-KWS: Contrastive Augmentation: An Unsupervised Learning Approach for Keyword Spotting in Speech Technology. In International Conference on Pattern Recognition. Springer, 98-112","author":"Dai Weinan","year":"2025","unstructured":"Weinan Dai, Yifeng Jiang, Yuanjing Liu, Jinkun Chen, Xin Sun, and Jinglei Tao. 2025. CAB-KWS: Contrastive Augmentation: An Unsupervised Learning Approach for Keyword Spotting in Speech Technology. In International Conference on Pattern Recognition. Springer, 98-112."},{"key":"e_1_3_2_1_9_1","first-page":"2592","article-title":"Stacked 1D Convolutional Networks for End-to-End Small Footprint Voice Trigger Detection","author":"Higuchi Takuya","year":"2020","unstructured":"Takuya Higuchi, Mohammad Ghasemzadeh, Kisun You, and Chandra Dhir. 2020. Stacked 1D Convolutional Networks for End-to-End Small Footprint Voice Trigger Detection. In INTERSPEECH. 2592-2596.","journal-title":"INTERSPEECH."},{"key":"e_1_3_2_1_10_1","volume-title":"Visual context-driven audio feature enhancement for robust end-to-end audio-visual speech recognition. arXiv preprint arXiv:2207.06020","author":"Hong Joanna","year":"2022","unstructured":"Joanna Hong, Minsu Kim, Daehun Yoo, and Yong Man Ro. 2022. Visual context-driven audio feature enhancement for robust end-to-end audio-visual speech recognition. arXiv preprint arXiv:2207.06020 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2019.2936282"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2019.2936282"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414156"},{"key":"e_1_3_2_1_14_1","volume-title":"Supervised Noise Reduction for Multichannel Keyword Spotting. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 5474-5478","author":"Huang Yiteng","year":"2018","unstructured":"Yiteng Huang, Thad Hughes, Turaj Z. Shabestary, and Taylor Applebaum. 2018. Supervised Noise Reduction for Multichannel Keyword Spotting. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 5474-5478."},{"key":"e_1_3_2_1_15_1","volume-title":"Query-by-Example On-Device Keyword Spotting. In 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). 532-538","author":"Kim Byeonggeun","year":"2019","unstructured":"Byeonggeun Kim, Mingu Lee, Jinkyu Lee, Yeonseok Kim, and Kyuwoong Hwang. 2019. Query-by-Example On-Device Keyword Spotting. In 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU). 532-538."},{"key":"e_1_3_2_1_16_1","first-page":"2758","article-title":"Lip to speech synthesis with visual context attentional GAN","volume":"34","author":"Kim Minsu","year":"2021","unstructured":"Minsu Kim, Joanna Hong, and Yong Man Ro. 2021. Lip to speech synthesis with visual context attentional GAN. Advances in Neural Information Processing Systems, Vol. 34 (2021), 2758-2770.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","volume-title":"Text-aware Speech Separation for Multi-talker Keyword Spotting. arXiv preprint arXiv:2406.12447","author":"Li Haoyu","year":"2024","unstructured":"Haoyu Li, Baochen Yang, Yu Xi, Linfeng Yu, Tian Tan, Hao Li, and Kai Yu. 2024. Text-aware Speech Separation for Multi-talker Keyword Spotting. arXiv preprint arXiv:2406.12447 (2024)."},{"key":"e_1_3_2_1_18_1","first-page":"1","article-title":"Phoneme-Level Contrastive Learning for User-Defined Keyword Spotting with Flexible Enrollment. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Li Kewei","year":"2025","unstructured":"Kewei Li, Hengshun Zhou, Kai Shen, Yusheng Dai, and Jun Du. 2025. Phoneme-Level Contrastive Learning for User-Defined Keyword Spotting with Flexible Enrollment. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5.","journal-title":"IEEE"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCE.2022.3213075"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682868"},{"key":"e_1_3_2_1_21_1","first-page":"1","article-title":"Frequency & Channel Attention Network for Small Footprint Noisy Spoken Keyword Spotting. In 2024 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","author":"Lin Yuanxi","year":"2024","unstructured":"Yuanxi Lin and Yuriy Evgenyevich Gapanyuk. 2024. Frequency & Channel Attention Network for Small Footprint Noisy Spoken Keyword Spotting. In 2024 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). IEEE, 1-6.","journal-title":"IEEE"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3139508"},{"key":"e_1_3_2_1_23_1","volume-title":"DCCRN-KWS: An audio bias based model for noise robust small-footprint keyword spotting. arXiv preprint arXiv:2305.12331","author":"Lv Shubo","year":"2023","unstructured":"Shubo Lv, Xiong Wang, Sining Sun, Long Ma, and Lei Xie. 2023. DCCRN-KWS: An audio bias based model for noise robust small-footprint keyword spotting. arXiv preprint arXiv:2305.12331 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Low-Bit Quantization and Quantization-Aware Training for Small-Footprint Keyword Spotting. In 2019 18th IEEE International Conference On Machine Learning And Applications (ICMLA). 706-711","author":"Mishchenko Yuriy","year":"2019","unstructured":"Yuriy Mishchenko, Yusuf Goren, Ming Sun, Chris Beauchene, Spyros Matsoukas, Oleg Rybakov, and Shiv Naga Prasad Vitaladevuni. 2019. Low-Bit Quantization and Quantization-Aware Training for Small-Footprint Keyword Spotting. In 2019 18th IEEE International Conference On Machine Learning And Applications (ICMLA). 706-711."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446912"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447547"},{"key":"e_1_3_2_1_27_1","volume-title":"g2p en: A simple Python module for English grapheme to phoneme conversion. URL: https:\/\/github.com\/Kyubyong\/g2p","author":"Park Kyubyong","year":"2019","unstructured":"Kyubyong Park and Jongseok Kim. 2019. g2p en: A simple Python module for English grapheme to phoneme conversion. URL: https:\/\/github.com\/Kyubyong\/g2p (2019)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine Mcleavey, and Ilya Sutskever. 2023. Robust Speech Recognition via Large-Scale Weak Supervision. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 28492-28518. https:\/\/proceedings.mlr.press\/v202\/radford23a.html"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095400"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"e_1_3_2_1_31_1","volume-title":"Lip Reading in the Wild. In Asian Conference on Computer Vision.","author":"Son Chung Joon","year":"2016","unstructured":"Chung Joon Son and Zisserman Andrew. 2016. Lip Reading in the Wild. In Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_32_1","unstructured":"Jonas Sunde Valfridsson. 2021. Query By Example Keyword Spotting."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2114881"},{"key":"e_1_3_2_1_34_1","volume-title":"Query-by-Example Keyword Spotting Using Spectral-Temporal Graph Attentive Pooling and Multi-Task Learning. arXiv preprint arXiv:2409.00099","author":"Wang Zhenyu","year":"2024","unstructured":"Zhenyu Wang, Shuyu Kong, Li Wan, Biqiao Zhang, Yiteng Huang, Mumin Jin, Ming Sun, Xin Lei, and Zhaojun Yang. 2024. Query-by-Example Keyword Spotting Using Spectral-Temporal Graph Attentive Pooling and Multi-Task Learning. arXiv preprint arXiv:2409.00099 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition. CoRR","author":"Warden Pete","year":"2018","unstructured":"Pete Warden. 2018. Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition. CoRR, Vol. abs\/1804.03209 (2018)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889332"},{"key":"e_1_3_2_1_37_1","first-page":"1638","article-title":"Robust keyword spotting for noisy environments by leveraging speech enhancement and speech presence probability","volume":"2023","author":"Yang Chouchang","year":"2023","unstructured":"Chouchang Yang, Yashas Malur Saidutta, Rakshith Sharma Srinivasa, Ching-Hua Lee, Yilin Shen, and Hongxia Jin. 2023. Robust keyword spotting for noisy environments by leveraging speech enhancement and speech presence probability. In Proc. Interspeech 2023. 1638-1642.","journal-title":"Proc. Interspeech"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756582"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096858"},{"key":"e_1_3_2_1_40_1","volume-title":"Hello edge: Keyword spotting on microcontrollers. arXiv preprint arXiv:1711.07128","author":"Zhang Yundong","year":"2017","unstructured":"Yundong Zhang, Naveen Suda, Liangzhen Lai, and Vikas Chandra. 2017. Hello edge: Keyword spotting on microcontrollers. arXiv preprint arXiv:1711.07128 (2017)."},{"key":"e_1_3_2_1_41_1","volume-title":"Audio-Visual Wake Word Spotting in MISP2021 Challenge: Dataset Release and Deep Analysis.. In Interspeech. 1111-1115","author":"Zhou Hengshun","year":"2022","unstructured":"Hengshun Zhou, Jun Du, Gongzhen Zou, Zhaoxu Nian, Chin-Hui Lee, Sabato Marco Siniscalchi, Shinji Watanabe, Odette Scharenborg, Jingdong Chen, Shifu Xiong, et al., 2022b. Audio-Visual Wake Word Spotting in MISP2021 Challenge: Dataset Release and Deep Analysis.. In Interspeech. 1111-1115."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10650"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758268","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:57Z","timestamp":1765342797000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758268"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":42,"alternative-id":["10.1145\/3746027.3758268","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758268","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}