{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:45:13Z","timestamp":1767339913640,"version":"3.40.3"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031456442"},{"type":"electronic","value":"9783031456459"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-45645-9_63","type":"book-chapter","created":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T04:30:12Z","timestamp":1707798612000},"page":"655-667","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multimodal Data Fusion Architectures in\u00a0Audiovisual Speech Recognition"],"prefix":"10.1007","author":[{"given":"Hadeer M.","family":"Sayed","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hesham E.","family":"ElDeeb","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shereen A.","family":"Taie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,2,14]]},"reference":[{"key":"63_CR1","volume-title":"Artificial Intelligence a Modern Approach","author":"SJ Russell","year":"2010","unstructured":"Russell, S.J.: Artificial Intelligence a Modern Approach. Pearson Education Inc., London (2010)"},{"key":"63_CR2","unstructured":"Zaykovskiy, D.: Survey of the speech recognition techniques for mobile devices. In: Proceedings of DS Publications (2006)"},{"key":"63_CR3","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/3348.001.0001","volume-title":"Graphical Models for Machine Learning and Digital Communication","author":"BJ Frey","year":"1998","unstructured":"Frey, B.J.: Graphical Models for Machine Learning and Digital Communication. MIT Press, Cambridge (1998)"},{"key":"63_CR4","series-title":"LNCS","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-44415-3_16","volume-title":"Joint IAPR International Workshops on Statistical Techniques in Pattern Recognition (SPR) and Structural and Syntactic Pattern Recognition (SSPR)","author":"E Morvant","year":"2014","unstructured":"Morvant, E., Habrard, A., Ayache, S.: Majority vote of diverse classifiers for late fusion. In: Fr\u00e4nti, P., Brown, G., Loog, M., Escolano, F., Pelillo, M. (eds.) S+SSPR 2014. LNCS, vol. 8621. Springer, Heidelberg (2014). https:\/\/doi.org\/10.1007\/978-3-662-44415-3_16"},{"issue":"7","key":"63_CR5","doi-asserted-by":"publisher","first-page":"1553","DOI":"10.1109\/TMM.2013.2267205","volume":"15","author":"G Evangelopoulos","year":"2013","unstructured":"Evangelopoulos, G., et al.: Multimodal saliency and fusion for movie summarization based on aural, visual, and textual attention. IEEE Trans. Multimedia 15(7), 1553\u20131568 (2013)","journal-title":"IEEE Trans. Multimedia"},{"key":"63_CR6","doi-asserted-by":"crossref","unstructured":"Shutova, E., Kiela, D., Maillard, J.: Black holes and white rabbits: metaphor identification with visual features. In: Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (2016)","DOI":"10.18653\/v1\/N16-1020"},{"issue":"9","key":"63_CR7","doi-asserted-by":"publisher","first-page":"1306","DOI":"10.1109\/JPROC.2003.817150","volume":"91","author":"G Potamianos","year":"2003","unstructured":"Potamianos, G., Neti, C., Gravier, G., Garg, A., Senior, A.W.: Recent advances in the automatic recognition of audiovisual speech. Proc. IEEE 91(9), 1306\u20131326 (2003)","journal-title":"Proc. IEEE"},{"key":"63_CR8","doi-asserted-by":"publisher","first-page":"26","DOI":"10.1109\/RBME.2011.2170675","volume":"4","author":"F Biessmann","year":"2011","unstructured":"Biessmann, F., Plis, S., Meinecke, F.C., Eichele, T., Muller, K.R.: Analysis of multimodal neuroimaging data. IEEE Rev. Biomed. Eng. 4, 26\u201358 (2011)","journal-title":"IEEE Rev. Biomed. Eng."},{"key":"63_CR9","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1016\/j.chemolab.2013.06.006","volume":"129","author":"E Acar","year":"2013","unstructured":"Acar, E., Rasmussen, M.A., Savorani, F., N\u00e6s, T., Bro, R.: Understanding data fusion within the framework of coupled matrix and tensor factorizations. Chemometr. Intell. Lab. Syst. 129, 53\u201363 (2013)","journal-title":"Chemometr. Intell. Lab. Syst."},{"issue":"1","key":"63_CR10","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1016\/j.chemolab.2010.04.012","volume":"104","author":"I Van Mechelen","year":"2010","unstructured":"Van Mechelen, I., Smilde, A.K.: A generic linked-mode decomposition model for data fusion. Chemometr. Intell. Lab. Syst. 104(1), 83\u201394 (2010)","journal-title":"Chemometr. Intell. Lab. Syst."},{"issue":"6","key":"63_CR11","doi-asserted-by":"publisher","first-page":"2405","DOI":"10.1109\/JSTARS.2014.2305441","volume":"7","author":"C Debes","year":"2014","unstructured":"Debes, C., et al.: Hyperspectral and LiDAR data fusion: outcome of the 2013 GRSS data fusion contest. IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens. 7(6), 2405\u20132418 (2014)","journal-title":"IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens."},{"key":"63_CR12","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: ICML (2011)"},{"key":"63_CR13","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1016\/j.dsp.2018.06.004","volume":"82","author":"MH Rahmani","year":"2018","unstructured":"Rahmani, M.H., Almasganj, F., Seyyedsalehi, S.A.: Audio-visual feature fusion via deep neural networks for automatic speech recognition. Digit. Signal Process. 82, 54\u201363 (2018)","journal-title":"Digit. Signal Process."},{"key":"63_CR14","doi-asserted-by":"crossref","unstructured":"Yang, X., Ramesh, P., Chitta, R., Madhvanath, S., Bernal, E.A., Luo, J.: Deep multimodal representation learning from temporal data. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5447\u20135455 (2017)","DOI":"10.1109\/CVPR.2017.538"},{"key":"63_CR15","doi-asserted-by":"crossref","unstructured":"Petridis, S., Li, Z., Pantic, M.: End-to-end visual speech recognition with LSTMs. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2592\u20132596. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952625"},{"key":"63_CR16","doi-asserted-by":"crossref","unstructured":"Petridis, S., Stafylakis, T., Ma, P., Cai, F., Tzimiropoulos, G., Pantic, M.: End-to-end audiovisual speech recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6548\u20136552. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461326"},{"key":"63_CR17","doi-asserted-by":"crossref","unstructured":"Yang, C.-C., Fan, W.-C., Yang, C.-F., Wang, Y.-C.F.: Cross-modal mutual learning for audio-visual speech recognition and manipulation. In: Proceedings of the 36th AAAI Conference on Artificial Intelligence, Vancouver, BC, Canada, vol. 22 (2022)","DOI":"10.1609\/aaai.v36i3.20210"},{"key":"63_CR18","first-page":"1","volume":"112","author":"HM Sayed","year":"2021","unstructured":"Sayed, H.M., ElDeeb, H.E., Taie, S.A.: Bimodal variational autoencoder for audiovisual speech recognition. Mach. Learn. 112, 1\u201326 (2021)","journal-title":"Mach. Learn."},{"key":"63_CR19","doi-asserted-by":"crossref","unstructured":"Zhang, S., Lei, M., Ma, B., Xie, L.: Robust audio-visual speech recognition using bimodal DFSMN with multi-condition training and dropout regularization. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6570\u20136574. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682566"},{"key":"63_CR20","unstructured":"Povey, D., et al.: The Kaldi speech recognition toolkit. In: IEEE 2011 Workshop on Automatic Speech Recognition and Understanding, no. CONF. IEEE Signal Processing Society (2011)"},{"key":"63_CR21","doi-asserted-by":"publisher","first-page":"1788","DOI":"10.1109\/TASLP.2020.3000593","volume":"28","author":"M Sadeghi","year":"2020","unstructured":"Sadeghi, M., Leglaive, S., Alameda-Pineda, X., Girin, L., Horaud, R.: Audio-visual speech enhancement using conditional variational auto-encoders. IEEE\/ACM Trans. Audio Speech Lang. Process. 28, 1788\u20131800 (2020)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"63_CR22","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"414","DOI":"10.1007\/978-3-540-74494-8_52","volume-title":"International Conference on Independent Component Analysis and Signal Separation","author":"P Smaragdis","year":"2007","unstructured":"Smaragdis, P., Raj, B., Shashanka, M.: Supervised and semi-supervised separation of sounds from single-channel mixtures. In: Davies, M.E., James, C.J., Abdallah, S.A., Plumbley, M.D. (eds.) ICA 2007. LNCS, vol. 4666, pp. 414\u2013421. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-74494-8_52"},{"key":"63_CR23","doi-asserted-by":"crossref","unstructured":"Gabbay, A., Shamir, A., Peleg, S.: Visual speech enhancement. arXiv preprint arXiv:1711.08789 (2017)","DOI":"10.21437\/Interspeech.2018-1955"},{"key":"63_CR24","doi-asserted-by":"crossref","unstructured":"Zhou, P., Yang, W., Chen, W., Wang, Y., Jia, J.: Modality attention for end-to-end audio-visual speech recognition. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6565\u20136569. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8683733"},{"key":"63_CR25","doi-asserted-by":"crossref","unstructured":"Chiu, C.-C., et al.: State-of-the-art speech recognition with sequence-to-sequence models. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4774\u20134778. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"63_CR26","doi-asserted-by":"crossref","unstructured":"Son Chung, J., Senior, A., Vinyals, O., Zisserman, A.: Lip reading sentences in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6447\u20136456 (2017)","DOI":"10.1109\/CVPR.2017.367"},{"key":"63_CR27","doi-asserted-by":"crossref","unstructured":"Sterpu, G., Saam, C., Harte, N.: Attention-based audio-visual fusion for robust automatic speech recognition. In: Proceedings of the 20th ACM International Conference on Multimodal Interaction, pp. 111\u2013115 (2018)","DOI":"10.1145\/3242969.3243014"},{"key":"63_CR28","doi-asserted-by":"crossref","unstructured":"Yu, J., et al.: Audio-visual recognition of overlapped speech for the LRS2 dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6984\u20136988. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9054127"},{"key":"63_CR29","doi-asserted-by":"crossref","unstructured":"Braga, O., Makino, T., Siohan, O., Liao, H.: End-to-end multi-person audio\/visual automatic speech recognition. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6994\u20136998. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053974"},{"key":"63_CR30","doi-asserted-by":"crossref","unstructured":"Ma, P., Petridis, S., Pantic, M.: End-to-end audio-visual speech recognition with conformers. In: ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414567"},{"key":"63_CR31","doi-asserted-by":"crossref","unstructured":"Shi, B., Hsu, W.-N., Mohamed, A.: Robust Self-Supervised Audio-Visual Speech Recognition. arXiv preprint arXiv:2201.01763 (2022)","DOI":"10.21437\/Interspeech.2022-99"}],"container-title":["Lecture Notes in Networks and Systems","Information Systems and Technologies"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-45645-9_63","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T03:26:07Z","timestamp":1741577167000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-45645-9_63"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031456442","9783031456459"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-45645-9_63","relation":{},"ISSN":["2367-3370","2367-3389"],"issn-type":[{"type":"print","value":"2367-3370"},{"type":"electronic","value":"2367-3389"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"14 February 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"WorldCIST","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"World Conference on Information Systems and Technologies","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pisa","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 April 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 April 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"worldcist2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/worldcist.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}