{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T08:35:14Z","timestamp":1742978114964,"version":"3.40.3"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031781858"},{"type":"electronic","value":"9783031781865"}],"license":[{"start":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T00:00:00Z","timestamp":1732924800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T00:00:00Z","timestamp":1732924800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78186-5_20","type":"book-chapter","created":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T14:17:02Z","timestamp":1732889822000},"page":"297-312","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Conformer-Based Audio Visual Speech Recognition with\u00a0Taylor Attention"],"prefix":"10.1007","author":[{"given":"Yewei","family":"Xiao","sequence":"first","affiliation":[]},{"given":"Jian","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Xuanming","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Aosu","family":"Zhu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,30]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"Levis, J., Suvorov, R.: Automatic speech recognition. In: The Encyclopedia of Applied Linguistics (2012)","DOI":"10.1002\/9781405198431.wbeal0066"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Burchi, M., Vielzeuf, V.: Efficient conformer: progressive downsampling and grouped attention for automatic speech recognition. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 8\u201315. IEEE, December 2021","DOI":"10.1109\/ASRU51503.2021.9687874"},{"key":"20_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1007\/978-3-319-54184-6_6","volume-title":"Computer Vision \u2013 ACCV 2016","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Lip reading in the wild. In: Lai, S.-H., Lepetit, V., Nishino, K., Sato, Y. (eds.) ACCV 2016. LNCS, vol. 10112, pp. 87\u2013103. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54184-6_6"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Chen, W., Xing, X., Xu, X., Yang, J., Pang, J.: Key-sparse transformer for multimodal speech emotion recognition. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6897\u20136901. IEEE, May 2022","DOI":"10.1109\/ICASSP43922.2022.9746598"},{"issue":"11","key":"20_CR5","doi-asserted-by":"publisher","first-page":"930","DOI":"10.1038\/s42256-022-00550-z","volume":"4","author":"P Ma","year":"2022","unstructured":"Ma, P., Petridis, S., Pantic, M.: Visual speech recognition for multiple languages in the wild. Nat. Mach. Intell. 4(11), 930\u2013939 (2022)","journal-title":"Nat. Mach. Intell."},{"key":"20_CR6","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Gulati, A., et al.: Conformer: convolution-augmented transformer for speech recognition (2020). arXiv preprint arXiv:2005.08100","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"20_CR8","unstructured":"Shen, Z., Zhang, M., Zhao, H., Yi, S., Li, H.: Efficient attention: attention with linear complexities. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 3531\u20133539 (2021)"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Chang, O., Liao, H., Serdyuk, D., Shah, A., Siohan, O.: Conformers are all You need for visual speech recognition. arXiv preprint arXiv:2302.10915 (2023)","DOI":"10.1109\/ICASSP48485.2024.10446532"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Chen, S., et al.: Continuous speech separation with conformer. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5749\u20135753. IEEE, June 2021","DOI":"10.1109\/ICASSP39728.2021.9413423"},{"key":"20_CR11","unstructured":"Beltagy, I., Peters, M.E., Cohan, A.: Longformer: the long-document transformer (2020). arXiv preprint arXiv:2004.05150"},{"issue":"3","key":"20_CR12","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1109\/6046.865479","volume":"2","author":"S Dupont","year":"2000","unstructured":"Dupont, S., Luettin, J.: Audio-visual speech modeling for continuous speech recognition. IEEE Trans. Multimedia 2(3), 141\u2013151 (2000)","journal-title":"IEEE Trans. Multimedia"},{"issue":"3","key":"20_CR13","doi-asserted-by":"publisher","first-page":"361","DOI":"10.1016\/S0959-440X(96)80056-X","volume":"6","author":"SR Eddy","year":"1996","unstructured":"Eddy, S.R.: Hidden Markov models. Curr. Opin. Struct. Biol. 6(3), 361\u2013365 (1996)","journal-title":"Curr. Opin. Struct. Biol."},{"issue":"12","key":"20_CR14","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Petridis, S., Stafylakis, T., Ma, P., Tzimiropoulos, G., Pantic, M.: Audio-visual speech recognition with a hybrid CTC\/attention architecture. In: 2018 IEEE Spoken Language Technology Workshop (SLT), pp. 513\u2013520. IEEE, December 2018","DOI":"10.1109\/SLT.2018.8639643"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Makino, T., et al.: Recurrent neural network transducer for audio-visual speech recognition. In: 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 905\u2013912. IEEE, December 2019","DOI":"10.1109\/ASRU46091.2019.9004036"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Xu, B., Lu, C., Guo, Y., Wang, J.: Discriminative multi-modality speech recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14433\u201314442 (2020)","DOI":"10.1109\/CVPR42600.2020.01444"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Li, W., Wang, S., Lei, M., Siniscalchi, S.M., Lee, C.H.: Improving audio-visual speech recognition performance with cross-modal student-teacher training. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6560\u20136564. IEEE, May 2019","DOI":"10.1109\/ICASSP.2019.8682868"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Paraskevopoulos, G., Parthasarathy, S., Khare, A., Sundaram, S.: Multiresolution and multimodal speech recognition with transformers (2020). arXiv preprint arXiv:2004.14840","DOI":"10.18653\/v1\/2020.acl-main.216"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Shukla, A., Vougioukas, K., Ma, P., Petridis, S., Pantic, M.: Visually guided self supervised learning of speech representations. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6299\u20136303. IEEE, May 2020","DOI":"10.1109\/ICASSP40776.2020.9053415"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Tao, F., Busso, C.: End-to-end audiovisual speech recognition system with multitask learning. IEEE Trans. Multimedia 23, 1\u201311 (2020)","DOI":"10.1109\/TMM.2020.2975922"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Martinez, B., Ma, P., Petridis, S., Pantic, M.: Lipreading using temporal convolutional networks. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6319\u20136323. IEEE, May 2020","DOI":"10.1109\/ICASSP40776.2020.9053841"},{"key":"20_CR23","unstructured":"Choromanski, K., et al.: Rethinking attention with performers (2020). arXiv preprint arXiv:2009.14794"},{"key":"20_CR24","unstructured":"Qin, Z., et al.: cosFormer: rethinking softmax in attention (2022). arXiv preprint arXiv:2202.08791"},{"key":"20_CR25","unstructured":"Kitaev, N., Kaiser, \u0141., Levskaya, A.: Reformer: the efficient transformer (2020). arXiv preprint arXiv:2001.04451"},{"key":"20_CR26","doi-asserted-by":"publisher","unstructured":"Bolya, D., Fu, C.Y., Dai, X., Zhang, P., Hoffman, J.: Hydra attention: efficient attention with many heads. In: Karlinsky, L., Michaeli, T., Nishino, K. (eds.) ECCV 2022. LNCS, vol. 13807, pp. 35\u201349. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-25082-8_3","DOI":"10.1007\/978-3-031-25082-8_3"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Deng, J., et al.: Confidence score based conformer speaker adaptation for speech recognition (2022). arXiv preprint arXiv:2206.12045","DOI":"10.21437\/Interspeech.2022-680"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Burchi, M., Timofte, R.: Audio-visual efficient conformer for robust speech recognition. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2258\u20132267 (2023)","DOI":"10.1109\/WACV56688.2023.00229"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: MFA-Conformer: multi-scale feature aggregation conformer for automatic speaker verification (2022). arXiv preprint arXiv:2203.15249","DOI":"10.21437\/Interspeech.2022-563"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Andrusenko, A., Nasretdinov, R., Romanenko, A.: UCONV-conformer: high reduction of input sequence length for end-to-end speech recognition. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE, June 2023","DOI":"10.1109\/ICASSP49357.2023.10095430"},{"key":"20_CR31","doi-asserted-by":"crossref","unstructured":"Hernandez, S.M., et al.: Sharing low rank conformer weights for tiny always-on ambient speech recognition models. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE, June 2023","DOI":"10.1109\/ICASSP49357.2023.10095006"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Ververas, E., Kotsia, I., Zafeiriou, S.: RetinaFace: single-shot multi-level face localisation in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5203\u20135212 (2020)","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Bulat, A., Tzimiropoulos, G.: How far are we from solving the 2D & 3D face alignment problem?(and a dataset of 230,000 3D facial landmarks). In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1021\u20131030 (2017)","DOI":"10.1109\/ICCV.2017.116"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Kudo, T., Richardson, J.: SentencePiece: a simple and language independent subword tokenizer and detokenizer for neural text processing (2018). arXiv preprint arXiv:1808.06226","DOI":"10.18653\/v1\/D18-2012"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: SpecAugment on large scale datasets. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6879\u20136883. IEEE, May 2020","DOI":"10.1109\/ICASSP40776.2020.9053205"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Prajwal, K.R., Afouras, T., Zisserman, A.: Sub-word level lip reading with visual attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5162\u20135172 (2022)","DOI":"10.1109\/CVPR52688.2022.00510"},{"key":"20_CR37","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization (2014). arXiv preprint arXiv:1412.6980"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, X., Cheng, F., Wang, S.: Spatio-temporal fusion based convolutional sequence learning for lip reading. In: Proceedings of the IEEE\/CVF International conference on Computer Vision, pp. 713\u2013722 (2019)","DOI":"10.1109\/ICCV.2019.00080"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: ASR is all you need: cross-modal distillation for lip reading. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2143\u20132147. IEEE, May 2020","DOI":"10.1109\/ICASSP40776.2020.9054253"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Yu, J., et al.: Audio-visual recognition of overlapped speech for the LRS2 dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6984\u20136988. IEEE, May 2020","DOI":"10.1109\/ICASSP40776.2020.9054127"},{"key":"20_CR41","doi-asserted-by":"crossref","unstructured":"Ma, P., Petridis, S., Pantic, M.: End-to-end audio-visual speech recognition with conformers. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7613\u20137617. IEEE, June 2021","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"20_CR42","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xu, R., Wang, X., Hou, P., Tang, H., Song, M.: Hearing lips: improving lip reading by distilling speech recognizers. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, no. 04, pp. 6917\u20136924, April 2020","DOI":"10.1609\/aaai.v34i04.6174"},{"key":"20_CR43","unstructured":"Shillingford, B., et al.: Large-scale visual speech recognition (2018). arXiv preprint arXiv:1807.05162"},{"key":"20_CR44","doi-asserted-by":"crossref","unstructured":"Serdyuk, D., Braga, O., Siohan, O.: Audio-visual speech recognition is worth $$32\\times 32\\times 8$$ voxels. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 796\u2013802. IEEE, December 2021","DOI":"10.1109\/ASRU51503.2021.9688191"},{"key":"20_CR45","doi-asserted-by":"crossref","unstructured":"Shi, B., Hsu, W.N., Mohamed, A.: Robust self-supervised audio-visual speech recognition (2022). arXiv preprint arXiv:2201.01763","DOI":"10.21437\/Interspeech.2022-99"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78186-5_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T15:14:26Z","timestamp":1732893266000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78186-5_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,30]]},"ISBN":["9783031781858","9783031781865"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78186-5_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,30]]},"assertion":[{"value":"30 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}