{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T12:55:12Z","timestamp":1782305712060,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":73,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T00:00:00Z","timestamp":1715385600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,11]]},"DOI":"10.1145\/3613904.3642092","type":"proceedings-article","created":{"date-parts":[[2024,5,11]],"date-time":"2024-05-11T08:38:06Z","timestamp":1715416686000},"page":"1-15","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["Watch Your Mouth: Silent Speech Recognition with Depth Sensing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4551-4932","authenticated-orcid":false,"given":"Xue","family":"Wang","sequence":"first","affiliation":[{"name":"Electrical and Computer Engineering, University of California, Los Angeles, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6048-3268","authenticated-orcid":false,"given":"Zixiong","family":"Su","sequence":"additional","affiliation":[{"name":"Rekimoto Lab, The University of Tokyo, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3629-2514","authenticated-orcid":false,"given":"Jun","family":"Rekimoto","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Japan and Sony CSL Kyoto, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2472-6968","authenticated-orcid":false,"given":"Yang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Electrical and Computer Engineering, University of California, Los Angeles, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,5,11]]},"reference":[{"key":"e_1_3_3_3_1_1","volume-title":"LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon\u00a0Son Chung, and Andrew Zisserman. 2018. LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018)."},{"key":"e_1_3_3_3_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054253"},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs5041624"},{"key":"e_1_3_3_3_4_1","volume-title":"Lipnet: Sentence-level lipreading. arXiv preprint arXiv:1611.01599 2, 4","author":"Assael M","year":"2016","unstructured":"Yannis\u00a0M Assael, Brendan Shillingford, Shimon Whiteson, and Nando De\u00a0Freitas. 2016. Lipnet: Sentence-level lipreading. arXiv preprint arXiv:1611.01599 2, 4 (2016)."},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2642918.2647402"},{"key":"e_1_3_3_3_6_1","volume-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)."},{"key":"e_1_3_3_3_7_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2020.3023541"},{"key":"e_1_3_3_3_9_1","volume-title":"The speech chain","author":"Denes B","unstructured":"Peter\u00a0B Denes and Elliot Pinson. 1993. The speech chain. Macmillan."},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545634"},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415881"},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01398"},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3161735"},{"key":"e_1_3_3_3_14_1","volume-title":"Learn an effective lip reading model without pains. arXiv preprint arXiv:2011.07557","author":"Feng Dalu","year":"2020","unstructured":"Dalu Feng, Shuang Yang, Shiguang Shan, and Xilin Chen. 2020. Learn an effective lip reading model without pains. arXiv preprint arXiv:2011.07557 (2020)."},{"key":"e_1_3_3_3_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2047196.2047255"},{"key":"e_1_3_3_3_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/1753326.1753394"},{"key":"e_1_3_3_3_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2642918.2647383"},{"key":"e_1_3_3_3_20_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom\u00a0B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)."},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3172944.3172977"},{"key":"e_1_3_3_3_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00036"},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502015"},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300376"},{"key":"e_1_3_3_3_25_1","volume-title":"Proceedings of the Thirteenth Language Resources and Evaluation Conference. European Language Resources Association","author":"Kimura Naoki","year":"2022","unstructured":"Naoki Kimura, Zixiong Su, Takaaki Saeki, and Jun Rekimoto. 2022. SSR7000: A Synchronized Corpus of Ultrasound Tongue Imaging for End-to-End Silent Speech Recognition. In Proceedings of the Thirteenth Language Resources and Evaluation Conference. European Language Resources Association, Marseille, France, 6866\u20136873. https:\/\/aclanthology.org\/2022.lrec-1.741"},{"key":"e_1_3_3_3_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/1577069.1755843"},{"key":"e_1_3_3_3_27_1","volume-title":"Fifth ISCA workshop on speech synthesis.","author":"Kominek John","year":"2004","unstructured":"John Kominek and Alan\u00a0W Black. 2004. The CMU Arctic speech databases. In Fifth ISCA workshop on speech synthesis."},{"key":"e_1_3_3_3_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-48452-3_8"},{"key":"e_1_3_3_3_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/86.372898"},{"key":"e_1_3_3_3_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3311823.3311831"},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"publisher","DOI":"10.3115\/1118108.1118117"},{"key":"e_1_3_3_3_32_1","volume-title":"textblob Documentation. Release 0.15 2","author":"Loria Steven","year":"2018","unstructured":"Steven Loria. 2018. textblob Documentation. Release 0.15 2 (2018)."},{"key":"e_1_3_3_3_33_1","volume-title":"Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McClanahan, Esha Uboweja, Michael Hays, Fan Zhang, Chuo-Ling Chang, Ming\u00a0Guang Yong, Juhyun Lee, 2019. Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172 (2019)."},{"key":"e_1_3_3_3_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096889"},{"key":"e_1_3_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"e_1_3_3_3_36_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-022-00550-z"},{"key":"e_1_3_3_3_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2642918.2647404"},{"key":"e_1_3_3_3_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053841"},{"key":"e_1_3_3_3_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.982900"},{"key":"e_1_3_3_3_40_1","unstructured":"National\u00a0Institute of Standards and Technology. 2021. SCTK the NIST Scoring Toolkit. https:\/\/github.com\/usnistgov\/SCTK."},{"key":"e_1_3_3_3_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cad.2004.11.005"},{"key":"e_1_3_3_3_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445565"},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639643"},{"key":"e_1_3_3_3_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"e_1_3_3_3_45_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 652\u2013660","author":"Qi R","year":"2017","unstructured":"Charles\u00a0R Qi, Hao Su, Kaichun Mo, and Leonidas\u00a0J Guibas. 2017. Pointnet: Deep learning on point sets for 3d classification and segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition. 652\u2013660."},{"key":"e_1_3_3_3_46_1","volume-title":"Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems 30","author":"Qi Charles\u00a0Ruizhongtai","year":"2017","unstructured":"Charles\u00a0Ruizhongtai Qi, Li Yi, Hao Su, and Leonidas\u00a0J Guibas. 2017. Pointnet++: Deep hierarchical feature learning on point sets in a metric space. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_3_47_1","volume-title":"2011 8th international conference on information, communications & signal processing. IEEE, 1\u20135.","author":"Ren Zhou","year":"2011","unstructured":"Zhou Ren, Jingjing Meng, and Junsong Yuan. 2011. Depth camera based hand gesture recognition and its applications in human-computer-interaction. In 2011 8th international conference on information, communications & signal processing. IEEE, 1\u20135."},{"key":"e_1_3_3_3_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3196709.3196772"},{"key":"e_1_3_3_3_49_1","volume-title":"Super-convergence: Very fast training of neural networks using large learning rates. In Artificial intelligence and machine learning for multi-domain operations applications, Vol.\u00a011006","author":"Smith N","year":"2019","unstructured":"Leslie\u00a0N Smith and Nicholay Topin. 2019. Super-convergence: Very fast training of neural networks using large learning rates. In Artificial intelligence and machine learning for multi-domain operations applications, Vol.\u00a011006. SPIE, 369\u2013386."},{"key":"e_1_3_3_3_50_1","volume-title":"Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105","author":"Stafylakis Themos","year":"2017","unstructured":"Themos Stafylakis and Georgios Tzimiropoulos. 2017. Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105 (2017)."},{"key":"e_1_3_3_3_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581465"},{"key":"e_1_3_3_3_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448018.3458011"},{"key":"e_1_3_3_3_53_1","volume-title":"Visual contribution to speech intelligibility in noise. The journal of the acoustical society of america 26, 2","author":"Sumby H","year":"1954","unstructured":"William\u00a0H Sumby and Irwin Pollack. 1954. Visual contribution to speech intelligibility in noise. The journal of the acoustical society of america 26, 2 (1954), 212\u2013215."},{"key":"e_1_3_3_3_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242587.3242599"},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447526.3472060"},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3332165.3347872"},{"key":"e_1_3_3_3_57_1","volume-title":"YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors. arXiv preprint arXiv:2207.02696","author":"Wang Chien-Yao","year":"2022","unstructured":"Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan\u00a0Mark Liao. 2022. YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors. arXiv preprint arXiv:2207.02696 (2022)."},{"key":"e_1_3_3_3_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3369812"},{"key":"e_1_3_3_3_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/1866029.1866073"},{"key":"e_1_3_3_3_60_1","unstructured":"Kin-Yiu Wong. 2022. yolov7. https:\/\/github.com\/WongKinYiu\/yolov7."},{"key":"e_1_3_3_3_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173669"},{"key":"e_1_3_3_3_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/2470654.2466113"},{"key":"e_1_3_3_3_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00088"},{"key":"e_1_3_3_3_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756582"},{"key":"e_1_3_3_3_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580838"},{"key":"e_1_3_3_3_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"e_1_3_3_3_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3594738.3611365"},{"key":"e_1_3_3_3_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3494990"},{"key":"e_1_3_3_3_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580801"},{"key":"e_1_3_3_3_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG47880.2020.00134"},{"key":"e_1_3_3_3_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3332165.3347906"},{"key":"e_1_3_3_3_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/2700648.2809865"},{"key":"e_1_3_3_3_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746112"}],"event":{"name":"CHI '24: CHI Conference on Human Factors in Computing Systems","location":"Honolulu HI USA","acronym":"CHI '24","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGACCESS ACM Special Interest Group on Accessible Computing"]},"container-title":["Proceedings of the CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613904.3642092","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3613904.3642092","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:56:41Z","timestamp":1750291001000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3613904.3642092"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,11]]},"references-count":73,"alternative-id":["10.1145\/3613904.3642092","10.1145\/3613904"],"URL":"https:\/\/doi.org\/10.1145\/3613904.3642092","relation":{},"subject":[],"published":{"date-parts":[[2024,5,11]]},"assertion":[{"value":"2024-05-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}