{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T10:21:00Z","timestamp":1771064460665,"version":"3.50.1"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"1",
"license":[{"start":{"date-parts":[[2024,1,25]],"date-time":"2024-01-25T00:00:00Z","timestamp":1706140800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,25]],"date-time":"2024-01-25T00:00:00Z","timestamp":1706140800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],
"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272143"],"award-info":[{"award-number":["62272143"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Anhui Provincial Major Science and Technology Project","award":["202203a05020025"],"award-info":[{"award-number":["202203a05020025"]}]},{"name":"University Synergy Innovation Program of Anhui Province","award":["GXXT-2022-054"],"award-info":[{"award-number":["GXXT-2022-054"]}]}],
"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2024,2]]},"DOI":"10.1007\/s00530-023-01226-3","type":"journal-article","created":{"date-parts":[[2024,1,25]],"date-time":"2024-01-25T03:02:31Z","timestamp":1706151751000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Generalizing sentence-level lipreading to unseen speakers: a two-stream end-to-end approach"],"prefix":"10.1007","volume":"30",
"author":[{"given":"Yu","family":"Li","sequence":"first","affiliation":[]},{"given":"Feng","family":"Xue","sequence":"additional","affiliation":[]},{"given":"Lin","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Yincen","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Shujie","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,1,25]]},
"reference":[{"key":"1226_CR1","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xu, R., Wang, X., Hou, P., Tang, H., Song, M.: Hearing lips: improving lip reading by distilling speech recognizers. In: AAAI Conference on Artificial Intelligence, pp. 6917\u20136924 (2020)","DOI":"10.1609\/aaai.v34i04.6174"},
{"key":"1226_CR2","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Computer Vision\u2014ACCV 2016 Workshops: ACCV 2016 International Workshops, Taipei, Taiwan, November 20\u201324, 2016, Revised Selected Papers, Part II 13, pp. 251\u2013263. Springer (2017)","DOI":"10.1007\/978-3-319-54427-4_19"},
{"issue":"2","key":"1226_CR3","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1016\/S0167-739X(03)00145-6","volume":"20","author":"JO Kim","year":"2004","unstructured":"Kim, J.O., Lee, W., Hwang, J., Baik, K.S., Chung, C.H.: Lip print recognition for security systems by multi-resolution architecture. Future Gener. Comput. Syst. 20(2), 295\u2013301 (2004)","journal-title":"Future Gener. Comput. Syst."},
{"key":"1226_CR4","doi-asserted-by":"publisher","first-page":"981","DOI":"10.1007\/s11760-019-01630-1","volume":"14","author":"X Chen","year":"2020","unstructured":"Chen, X., Du, J., Zhang, H.: Lipreading with DenseNet and RESBI-LSTM. Signal Image Video Process. 14, 981\u2013989 (2020)","journal-title":"Signal Image Video Process."},
{"key":"1226_CR5","doi-asserted-by":"crossref","unstructured":"Lee, D., Lee, J., Kim, K.-E.: Multi-view automatic lip-reading using neural network. In: Computer Vision\u2014ACCV 2016 Workshops: ACCV 2016 International Workshops, Taipei, Taiwan, November 20\u201324, 2016, Revised Selected Papers, Part II 13, pp. 290\u2013302. Springer (2017)","DOI":"10.1007\/978-3-319-54427-4_22"},
{"key":"1226_CR6","doi-asserted-by":"crossref","unstructured":"Martinez, B., Ma, P., Petridis, S., Pantic, M.: Lipreading using temporal convolutional networks. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6319\u20136323. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053841"},
{"key":"1226_CR7","doi-asserted-by":"crossref","unstructured":"Petridis, S., Stafylakis, T., Ma, P., Cai, F., Tzimiropoulos, G., Pantic, M.: End-to-end audiovisual speech recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6548\u20136552. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461326"},
{"issue":"15","key":"1226_CR8","doi-asserted-by":"publisher","first-page":"6975","DOI":"10.3390\/app11156975","volume":"11","author":"T Zhang","year":"2021","unstructured":"Zhang, T., He, L., Li, X., Feng, G.: Efficient end-to-end sentence-level lipreading with temporal convolutional networks. Appl. Sci. 11(15), 6975 (2021)","journal-title":"Appl. Sci."},
{"key":"1226_CR9","doi-asserted-by":"crossref","unstructured":"Xu, K., Li, D., Cassimatis, N., Wang, X.: LCANet: End-to-end lipreading with cascaded attention-CTC. In: 2018 13th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2018), pp. 548\u2013555. IEEE (2018)","DOI":"10.1109\/FG.2018.00088"},
{"key":"1226_CR10","unstructured":"Margam, D.K., Aralikatti, R., Sharma, T., Thanda, A., Roy, S., Venkatesan, S.M., et al.: Lipreading with 3D-2D-CNN BLSTM-HMM and word-CTC models. arXiv preprint arXiv:1906.12170 (2019)"},
{"key":"1226_CR11","unstructured":"Assael, Y.M., Shillingford, B., Whiteson, S., De\u00a0Freitas, N.: LIPNet: end-to-end sentence-level lipreading. arXiv preprint arXiv:1611.01599 (2016)"},
{"key":"1226_CR12","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xu, R., Song, M.: A cascade sequence-to-sequence model for Chinese mandarin lip reading. In: Proceedings of the ACM Multimedia Asia, pp. 1\u20136 (2019)","DOI":"10.1145\/3338533.3366579"},
{"key":"1226_CR13","doi-asserted-by":"crossref","unstructured":"Haghpanah, M.A., Saeedizade, E., Masouleh, M.T., Kalhor, A.: Real-time facial expression recognition using facial landmarks and neural networks. In: 2022 International Conference on Machine Vision and Image Processing (MVIP), pp. 1\u20137. IEEE (2022)","DOI":"10.1109\/MVIP53647.2022.9738754"},
{"key":"1226_CR14","doi-asserted-by":"crossref","unstructured":"Lo, L., Xie, H.-X., Shuai, H.-H., Cheng, W.-H.: MER-GCN: micro-expression recognition based on relation modeling with graph convolutional networks. In: 2020 IEEE Conference on Multimedia Information Processing and Retrieval (MIPR), pp. 79\u201384. IEEE (2020)","DOI":"10.1109\/MIPR49039.2020.00023"},
{"key":"1226_CR15","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.12328"},
{"key":"1226_CR16","doi-asserted-by":"crossref","unstructured":"Stafylakis, T., Tzimiropoulos, G.: Combining residual networks with LSTMs for lipreading. arXiv preprint arXiv:1703.04105 (2017)","DOI":"10.21437\/Interspeech.2017-85"},
{"key":"1226_CR17","doi-asserted-by":"publisher","first-page":"204518","DOI":"10.1109\/ACCESS.2020.3036865","volume":"8","author":"M Hao","year":"2020","unstructured":"Hao, M., Mamut, M., Yadikar, N., Aysa, A., Ubul, K.: A survey of research on lipreading technology. IEEE Access 8, 204518\u2013204544 (2020)","journal-title":"IEEE Access"},
{"key":"1226_CR18","doi-asserted-by":"crossref","unstructured":"Xiao, J., Yang, S., Zhang, Y., Shan, S., Chen, X.: Deformation flow based two-stream network for lip reading. In: 2020 15th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2020), pp. 364\u2013370. IEEE (2020)","DOI":"10.1109\/FG47880.2020.00132"},
{"key":"1226_CR19","doi-asserted-by":"publisher","first-page":"22","DOI":"10.1016\/j.cviu.2018.10.003","volume":"176","author":"T Stafylakis","year":"2018","unstructured":"Stafylakis, T., Khan, M.H., Tzimiropoulos, G.: Pushing the boundaries of audiovisual word recognition using residual networks and LSTMs. Comput. Vis. Image Underst. 176, 22\u201332 (2018)","journal-title":"Comput. Vis. Image Underst."},
{"key":"1226_CR20","doi-asserted-by":"crossref","unstructured":"Zhao, X., Yang, S., Shan, S., Chen, X.: Mutual information maximization for effective lip reading. In: 2020 15th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2020), pp. 420\u2013427. IEEE (2020)","DOI":"10.1109\/FG47880.2020.00133"},
{"key":"1226_CR21","doi-asserted-by":"crossref","unstructured":"Wand, M., Schmidhuber, J.: Improving speaker-independent lipreading with domain-adversarial training. arXiv preprint arXiv:1708.01565 (2017)","DOI":"10.21437\/Interspeech.2017-421"},
{"key":"1226_CR22","doi-asserted-by":"crossref","unstructured":"Zhang, X., Gong, H., Dai, X., Yang, F., Liu, N., Liu, M.: Understanding pictograph with facial features: end-to-end sentence-level lip reading of Chinese. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 9211\u20139218 (2019)","DOI":"10.1609\/aaai.v33i01.33019211"},
{"key":"1226_CR23","first-page":"1","volume":"9","author":"F Xue","year":"2023","unstructured":"Xue, F., Yang, T., Liu, K., Hong, Z., Cao, M., Guo, D., Hong, R.: LCSNet: end-to-end lipreading with channel-aware feature selection. ACM Trans. Multimedia. Comput. Commun. Appl. 9, 1\u201321 (2023)","journal-title":"ACM Trans. Multimedia. Comput. Commun. Appl."},
{"key":"1226_CR24","doi-asserted-by":"crossref","unstructured":"Kim, M., Yeo, J.H., Choi, J., Ro, Y.M.: Lip reading for low-resource languages by learning and combining general speech knowledge and language-specific knowledge. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 15359\u201315371 (2023)","DOI":"10.1109\/ICCV51070.2023.01409"},
{"key":"1226_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2023.117002","volume":"117","author":"B Sun","year":"2023","unstructured":"Sun, B., Xie, D., Shi, H.: MALip: modal amplification lipreading based on reconstructed audio features. Signal Process. Image Commun. 117, 117002 (2023)","journal-title":"Signal Process. Image Commun."},
{"key":"1226_CR26","doi-asserted-by":"crossref","unstructured":"Santos, T.I., Abel, A., Wilson, N., Xu, Y.: Speaker-independent visual speech recognition with the inception v3 model. In: 2021 IEEE Spoken Language Technology Workshop (SLT), pp. 613\u2013620. IEEE (2021)","DOI":"10.1109\/SLT48900.2021.9383540"},
{"issue":"6","key":"1226_CR27","doi-asserted-by":"publisher","first-page":"1705","DOI":"10.1109\/TAI.2022.3220190","volume":"4","author":"P Nemani","year":"2023","unstructured":"Nemani, P., Krishna, G.S., Ramisetty, N., Sai, B.D.S., Kumar, S.: Deep Learning-Based Holistic Speaker Independent Visual Speech Recognition. IEEE Trans. Artif. Intell. 4(6), 1705\u20131713 (2023)","journal-title":"IEEE Trans. Artif. Intell."},
{"key":"1226_CR28","doi-asserted-by":"crossref","unstructured":"Huang, Y., Liang, X., Fang, C.: CALLip: Lipreading using contrastive and attribute learning. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2492\u20132500 (2021)","DOI":"10.1145\/3474085.3475420"},
{"key":"1226_CR29","doi-asserted-by":"crossref","unstructured":"Kim, M., Kim, H., Ro, Y.M.: Speaker-adaptive lip reading with user-dependent padding. In: European Conference on Computer Vision, pp. 576\u2013593. Springer (2022)","DOI":"10.1007\/978-3-031-20059-5_33"},
{"key":"1226_CR30","doi-asserted-by":"crossref","unstructured":"Kim, M., Kim, H.-I., Ro, Y.M.: Prompt tuning of deep neural networks for speaker-adaptive visual speech recognition. arXiv preprint arXiv:2302.08102 (2023)","DOI":"10.1109\/TPAMI.2024.3484658"},
{"key":"1226_CR31","doi-asserted-by":"crossref","unstructured":"Li, M., Chen, S., Zhao, Y., Zhang, Y., Wang, Y., Tian, Q.: Dynamic multiscale graph neural networks for 3d skeleton based human motion prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 214\u2013223 (2020)","DOI":"10.1109\/CVPR42600.2020.00029"},
{"key":"1226_CR32","doi-asserted-by":"publisher","first-page":"4433","DOI":"10.1109\/TMM.2021.3117124","volume":"24","author":"S Tang","year":"2021","unstructured":"Tang, S., Guo, D., Hong, R., Wang, M.: Graph-based multimodal sequential embedding for sign language translation. IEEE Transac. Multimed. 24, 4433\u20134445 (2021)","journal-title":"IEEE Transac. Multimed."},
{"key":"1226_CR33","doi-asserted-by":"crossref","unstructured":"Papadimitriou, K., Potamianos, G.: Sign language recognition via deformable 3D convolutions and modulated graph convolutional networks. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096714"},
{"issue":"1","key":"1226_CR34","doi-asserted-by":"publisher","first-page":"264","DOI":"10.1109\/TMI.2023.3299518","volume":"43","author":"Y Ban","year":"2024","unstructured":"Ban, Y., Eckhoff, J.A., Ward, T.M., Hashimoto, D.A., Meireles, O.R., Rus, D., Rosman, G.: Concept graph neural networks for surgical video understanding. IEEE Trans. Med. Imaging. 43(1), 264\u2013274 (2024)","journal-title":"IEEE Trans. Med. Imaging."},
{"issue":"8","key":"1226_CR35","doi-asserted-by":"publisher","first-page":"3038","DOI":"10.1109\/TITS.2018.2871262","volume":"20","author":"A Amodio","year":"2018","unstructured":"Amodio, A., Ermidoro, M., Maggi, D., Formentin, S., Savaresi, S.M.: Automatic detection of driver impairment based on pupillary light reflex. IEEE Trans. Intell. Transp. Syst. 20(8), 3038\u20133048 (2018)","journal-title":"IEEE Trans. Intell. Transp. Syst."},
{"issue":"1","key":"1226_CR36","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji, S., Xu, W., Yang, M., Yu, K.: 3D convolutional neural networks for human action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 35(1), 221\u2013231 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},
{"key":"1226_CR37","doi-asserted-by":"crossref","unstructured":"Cho, K., Van\u00a0Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)","DOI":"10.3115\/v1\/D14-1179"},
{"key":"1226_CR38","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Advances in Neural Information Processing Systems, pp. 3104\u20133112 (2014)"},
{"issue":"5","key":"1226_CR39","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., Barker, J., Cunningham, S., Shao, X.: An audio-visual corpus for speech perception and automatic speech recognition. J. Acoust. Soc. Am. 120(5), 2421\u20132424 (2006)","journal-title":"J. Acoust. Soc. Am."},
{"key":"1226_CR40","unstructured":"Levenshtein, V.I., et al.: Binary codes capable of correcting deletions, insertions, and reversals. In: Soviet Physics Doklady, vol. 10, pp. 707\u2013710. Soviet Union (1966)"},
{"key":"1226_CR41","doi-asserted-by":"crossref","unstructured":"Son\u00a0Chung, J., Senior, A., Vinyals, O., Zisserman, A.: Lip reading sentences in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6447\u20136456 (2017)","DOI":"10.1109\/CVPR.2017.367"}],
"container-title":["Multimedia Systems"],"original-title":[],"language":"en",
"link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01226-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-023-01226-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01226-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],
"deposited":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T02:43:51Z","timestamp":1731120231000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-023-01226-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1,25]]},"references-count":41,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,2]]}},"alternative-id":["1226"],"URL":"https:\/\/doi.org\/10.1007\/s00530-023-01226-3","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,1,25]]},
"assertion":[{"value":"26 July 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 December 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"42"}}