{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T02:46:35Z","timestamp":1778121995871,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,7,11]],"date-time":"2021-07-11T00:00:00Z","timestamp":1625961600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1936203"],"award-info":[{"award-number":["U1936203"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,7,11]]},"DOI":"10.1145\/3404835.3462964","type":"proceedings-article","created":{"date-parts":[[2021,7,12]],"date-time":"2021-07-12T02:41:54Z","timestamp":1626057714000},"page":"491-500","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Multimodal Activation: Awakening Dialog Robots without Wake Words"],"prefix":"10.1145","author":[{"given":"Liqiang","family":"Nie","sequence":"first","affiliation":[{"name":"Shandong University, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mengzhao","family":"Jia","sequence":"additional","affiliation":[{"name":"Shandong University, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuemeng","family":"Song","sequence":"additional","affiliation":[{"name":"Shandong University, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ganglu","family":"Wu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Harry","family":"Cheng","sequence":"additional","affiliation":[{"name":"Shandong University, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jian","family":"Gu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,7,11]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415588"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Mengli Cheng Chengyu Wang Xu Hu Jun Huang and Xiaobo Wang. 2020. Weakly Supervised Construction of ASR Systems with Massive Video Data. arXiv preprint arXiv:2008.01300 .","DOI":"10.21437\/Interspeech.2021-7"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911451.2914765"},{"key":"e_1_3_2_2_6_1","volume-title":"Out of Time: Automated Lip Sync In The Wild. In Asian Conference on Computer Vision. Springer, 251--263","author":"Chung Joon Son","year":"2016","unstructured":"Joon Son Chung and Andrew Zisserman. 2016. Out of Time: Automated Lip Sync In The Wild. In Asian Conference on Computer Vision. Springer, 251--263."},{"key":"e_1_3_2_2_7_1","volume-title":"Perfect Match: Improved Cross-modal Embeddings for Audio-visual Synchronisation. In IEEE International Conference on Acoustics, Speech and Signal Processing. IEEE, 3965--3969","author":"Chung Soo-Whan","year":"2019","unstructured":"Soo-Whan Chung, Joon Son Chung, and Hong-Goo Kang. 2019. Perfect Match: Improved Cross-modal Embeddings for Audio-visual Synchronisation. In IEEE International Conference on Acoustics, Speech and Signal Processing. IEEE, 3965--3969."},{"key":"e_1_3_2_2_8_1","volume-title":"Towards End-to-End Spoken Intent Recognition in Smart Home. In International Conference on Speech Technology and Human-Computer Dialogue. IEEE, 1--8.","author":"Desot Thierry","year":"2019","unstructured":"Thierry Desot, Francois Portet, and Michel Vacher. 2019. Towards End-to-End Spoken Intent Recognition in Smart Home. In International Conference on Speech Technology and Human-Computer Dialogue. IEEE, 1--8."},{"key":"e_1_3_2_2_9_1","volume-title":"Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. ACL, 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. ACL, 4171--4186."},{"key":"e_1_3_2_2_10_1","volume-title":"CN-Celeb: A Challenging Chinese Speaker Recognition Dataset. In IEEE International Conference on Acoustics, Speech and Signal Processing. IEEE, 7604--7608","author":"Fan Y.","unstructured":"Y. Fan, J. W. Kang, L. T. Li, K. C. Li, H. L. Chen, S. T. Cheng, P. Y. Zhang, Z. Y. Zhou, Y. Q. Cai, and D. Wang. 2020. CN-Celeb: A Challenging Chinese Speaker Recognition Dataset. In IEEE International Conference on Acoustics, Speech and Signal Processing. IEEE, 7604--7608."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1162\/jocn_a_00575"},{"key":"e_1_3_2_2_12_1","volume-title":"Speech Intention Classification with Multimodal Deep Learning. In Canadian Conference on Artificial Intelligence. Springer, 260--271","author":"Gu Yue","year":"2017","unstructured":"Yue Gu, Xinyu Li, Shuhong Chen, Jianyu Zhang, and Ivan Marsic. 2017. Speech Intention Classification with Multimodal Deep Learning. In Canadian Conference on Artificial Intelligence. Springer, 260--271."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331245"},{"key":"e_1_3_2_2_14_1","volume-title":"3D Convolutional Neural Networks for Human Action Recognition","author":"Ji Shuiwang","unstructured":"Shuiwang Ji, Wei Xu, Ming Yang, and Kai Yu. 2012. 3D Convolutional Neural Networks for Human Action Recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 35, 1, 221--231."},{"key":"e_1_3_2_2_15_1","volume-title":"Tongue Movements and Speech Acoustics. In International Conference on Spoken Language Processing. ISCA, 42--45","author":"Jiang Jintao","year":"2000","unstructured":"Jintao Jiang, Abeer Alwan, Lynne E Bernstein, Patricia Keating, and Ed Auer. 2000. On the Correlation between Facial Movements, Tongue Movements and Speech Acoustics. In International Conference on Spoken Language Processing. ISCA, 42--45."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCWC.2018.8301638"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.na.2009.06.089"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1181"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACSSC.1994.471519"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1002\/vis.4340020404"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1180"},{"key":"e_1_3_2_2_23_1","volume-title":"Detecting Audio-Visual Synchrony Using Deep Neural Networks. In the Annual Conference of the International Speech Communication Association. ISCA, 548--552","author":"Marcheret Etienne","year":"2015","unstructured":"Etienne Marcheret, Gerasimos Potamianos, Josef Vopicka, and Vaibhava Goel. 2015. Detecting Audio-Visual Synchrony Using Deep Neural Networks. In the Annual Conference of the International Speech Communication Association. ISCA, 548--552."},{"key":"e_1_3_2_2_24_1","volume-title":"Lip Synchronization of Speech. In Workshop on Audio-Visual Speech Processing. ISCA, 133--136","author":"McAllister David F","year":"1997","unstructured":"David F McAllister, Robert D Rodman, Donald L Bitzer, and Andrew S Freeman. 1997. Lip Synchronization of Speech. In Workshop on Audio-Visual Speech Processing. ISCA, 133--136."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3359278"},{"key":"e_1_3_2_2_26_1","volume-title":"Seeing Wake Words: Audio-visual Keyword Spotting. In British Machine Vision Conference. BMVA, 1--13","author":"Momeni Liliane","year":"2020","unstructured":"Liliane Momeni, Triantafyllos Afouras, Themos Stafylakis, Samuel Albanie, and Andrew Zisserman. 2020. Seeing Wake Words: Audio-visual Keyword Spotting. In British Machine Vision Conference. BMVA, 1--13."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2002.5745053"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331400"},{"key":"e_1_3_2_2_29_1","unstructured":"Alex Nichol Joshua Achiam and John Schulman. 2018. On first-order meta-learning algorithms. arXiv preprint arXiv:1803.02999 ."},{"key":"e_1_3_2_2_30_1","volume-title":"Multi-Task Deep Learning for User Intention Understanding in Speech Interaction Systems. In AAAI Conference on Artificial Intelligence. AAAI, 161--167","author":"Ning Yishuang","year":"2017","unstructured":"Yishuang Ning, Jia Jia, Zhiyong Wu, Runnan Li, Yongsheng An, Yanfeng Wang, and Helen Meng. 2017. Multi-Task Deep Learning for User Intention Understanding in Speech Interaction Systems. In AAAI Conference on Artificial Intelligence. AAAI, 161--167."},{"key":"e_1_3_2_2_31_1","volume-title":"Considering Wake Gestures for Smart Assistant Use. In the CHI Conference on Human Factors in Computing Systems. ACM, 1--8.","author":"Pomykalski Patryk","year":"2020","unstructured":"Patryk Pomykalski, Miko\u0142aj P Wo'zniak, Pawe\u0142 W Wo'zniak, Krzysztof Grudzie'n, Shengdong Zhao, and Andrzej Romanowski. 2020. Considering Wake Gestures for Smart Assistant Use. In the CHI Conference on Human Factors in Computing Systems. ACM, 1--8."},{"key":"e_1_3_2_2_32_1","volume-title":"Continuous Hidden Markov Modeling for Speaker-Independent Word Spotting. In International Conference on Acoustics, Speech, and Signal Processing. IEEE, 627--630","author":"Rohlicek J Robin","year":"1989","unstructured":"J Robin Rohlicek, William Russell, Salim Roukos, and Herbert Gish. 1989. Continuous Hidden Markov Modeling for Speaker-Independent Word Spotting. In International Conference on Acoustics, Speech, and Signal Processing. IEEE, 627--630."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-008-0121-2"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2007.906583"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/258525.258544"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_2_37_1","volume-title":"Improving End-to-End Speech-to-Intent Classification with Reptile. In the Annual Conference of the International Speech Communication Association. ISCA, 891--895","author":"Tian Yusheng","year":"2020","unstructured":"Yusheng Tian and Philip John Gorinski. 2020. Improving End-to-End Speech-to-Intent Classification with Reptile. In the Annual Conference of the International Speech Communication Association. ISCA, 891--895."},{"key":"e_1_3_2_2_38_1","volume-title":"Wake Word Detection with Alignment-Free Lattice-Free MMI. In the Annual Conference of the International Speech Communication Association, Virtual Event. ISCA, 4258--4262","author":"Wang Yiming","year":"2020","unstructured":"Yiming Wang, Hang Lv, Daniel Povey, Lei Xie, and Sanjeev Khudanpur. 2020. Wake Word Detection with Alignment-Free Lattice-Free MMI. In the Annual Conference of the International Speech Communication Association, Virtual Event. ISCA, 4258--4262."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2005.1511821"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-3302"},{"key":"e_1_3_2_2_41_1","volume-title":"Le","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime G. Carbonell, Ruslan Salakhutdinov, and Quoc V. Le. 2019. XLNet: Generalized Autoregressive Pretraining for Language Understanding. In Advances in Neural Information Processing Systems. MIT, 5754--5764."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401326"}],"event":{"name":"SIGIR '21: The 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Virtual Event Canada","acronym":"SIGIR '21","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462964","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3404835.3462964","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:20Z","timestamp":1750191500000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462964"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,11]]},"references-count":44,"alternative-id":["10.1145\/3404835.3462964","10.1145\/3404835"],"URL":"https:\/\/doi.org\/10.1145\/3404835.3462964","relation":{},"subject":[],"published":{"date-parts":[[2021,7,11]]},"assertion":[{"value":"2021-07-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}