{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T05:36:26Z","timestamp":1774935386450,"version":"3.50.1"},"reference-count":60,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2021,11,24]],"date-time":"2021-11-24T00:00:00Z","timestamp":1637712000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2021,11,24]],"date-time":"2021-11-24T00:00:00Z","timestamp":1637712000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2023,4]]},"DOI":"10.1007\/s10994-021-06112-5","type":"journal-article","created":{"date-parts":[[2021,11,24]],"date-time":"2021-11-24T21:02:35Z","timestamp":1637787755000},"page":"1201-1226","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":16,"title":["Bimodal variational autoencoder for audiovisual speech recognition"],"prefix":"10.1007","volume":"112","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6136-4823","authenticated-orcid":false,"given":"Hadeer M.","family":"Sayed","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hesham E.","family":"ElDeeb","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shereen A.","family":"Taie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,11,24]]},"reference":[{"key":"6112_CR1","doi-asserted-by":"crossref","unstructured":"Abdelaziz, A. H. (2017). Ntcd-timit: A new database and baseline for noise-robust audio-visual speech recognition. In: INTERSPEECH (pp. 3752\u20133756).","DOI":"10.21437\/Interspeech.2017-860"},{"key":"6112_CR2","first-page":"012050","volume":"1664","author":"S Adnan","year":"2020","unstructured":"Adnan, S., Ali, F., & Abdulmunem, A. A. (2020). Facial feature extraction for face recognition. Journal of Physics: Conference Series, IOP Publishing, 1664, 012050.","journal-title":"Journal of Physics: Conference Series, IOP Publishing"},{"key":"6112_CR3","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889052","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J. S., Senior, A., Vinyals, O., & Zisserman, A. (2018). Deep audio-visual speech recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence. https:\/\/doi.org\/10.1109\/TPAMI.2018.2889052","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"1","key":"6112_CR4","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1109\/T-C.1974.223784","volume":"100","author":"N Ahmed","year":"1974","unstructured":"Ahmed, N., Natarajan, T., & Rao, K. R. (1974). Discrete cosine transform. IEEE Transactions on Computers, 100(1), 90\u201393.","journal-title":"IEEE Transactions on Computers"},{"key":"6112_CR5","doi-asserted-by":"crossref","unstructured":"Amberkar, A., Awasarmol, P., Deshmukh, G., & Dave, P. (2018). Speech recognition using recurrent neural networks. In: 2018 international conference on current trends towards converging technologies (ICCTCT) (pp. 1\u20134). IEEE.","DOI":"10.1109\/ICCTCT.2018.8551185"},{"key":"6112_CR6","doi-asserted-by":"crossref","unstructured":"Anina, I., Zhou, Z., Zhao, G., & Pietik\u00e4inen, M. (2015). Ouluvs2: A multi-view audiovisual database for non-rigid mouth motion analysis. In 2015 11th IEEE international conference and workshops on automatic face and gesture recognition (FG) (vol. 1, pp. 1\u20135). IEEE.","DOI":"10.1109\/FG.2015.7163155"},{"issue":"2","key":"6112_CR7","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","volume":"41","author":"T Baltru\u0161aitis","year":"2018","unstructured":"Baltru\u0161aitis, T., Ahuja, C., & Morency, L. P. (2018). Multimodal machine learning: A survey and taxonomy. IEEE Transactions on Pattern Analysis and Machine Intelligence, 41(2), 423\u2013443.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"6112_CR8","doi-asserted-by":"crossref","unstructured":"Bengio, Y., Lamblin, P., Popovici, D., & Larochelle, H. (2007). Greedy layer-wise training of deep networks. In Advances in neural information processing systems (pp. 153\u2013160).","DOI":"10.7551\/mitpress\/7503.003.0024"},{"key":"6112_CR9","doi-asserted-by":"crossref","unstructured":"Bokade, R., Navato, A., Ouyang, R., Jin, X., Chou, C. A., Ostadabbas, S., & Mueller, A. V. (2020). A cross-disciplinary comparison of multimodal data fusion approaches and applications: Accelerating learning through trans-disciplinary information sharing. In Expert Systems with Applications (pp. 113885).","DOI":"10.1016\/j.eswa.2020.113885"},{"key":"6112_CR10","doi-asserted-by":"crossref","unstructured":"Cao, Q., Shen, L., Xie, W., Parkhi, O. M., & Zisserman, A. (2018). Vggface2: A dataset for recognising faces across pose and age. In 2018 13th IEEE international conference on automatic face and gesture recognition (FG 2018) (pp. 67\u201374). IEEE","DOI":"10.1109\/FG.2018.00020"},{"key":"6112_CR11","doi-asserted-by":"publisher","unstructured":"Cho, K., Van Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., & Bengio, Y. (2014). Learning phrase representations using rnn encoder-decoder for statistical machine translation. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), association for computational linguistics, Doha, Qatar (pp. 1724\u20131734). https:\/\/doi.org\/10.3115\/v1\/D14-1179, https:\/\/aclanthology.org\/D14-1179","DOI":"10.3115\/v1\/D14-1179"},{"issue":"3","key":"6112_CR12","doi-asserted-by":"publisher","first-page":"273","DOI":"10.1007\/BF00994018","volume":"20","author":"C Cortes","year":"1995","unstructured":"Cortes, C., & Vapnik, V. (1995). Support-vector networks. Machine Learning, 20(3), 273\u2013297.","journal-title":"Machine Learning"},{"key":"6112_CR13","doi-asserted-by":"crossref","unstructured":"Dalal, N., Triggs, B. (2005). Histograms of oriented gradients for human detection. In 2005 IEEE computer society conference on computer vision and pattern recognition (CVPR05) (Vol. 1, pp. 886\u2013893). IEEE.","DOI":"10.1109\/CVPR.2005.177"},{"issue":"4","key":"6112_CR14","doi-asserted-by":"publisher","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","volume":"28","author":"S Davis","year":"1980","unstructured":"Davis, S., & Mermelstein, P. (1980). Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE Transactions on Acoustics, Speech, and Signal Processing, 28(4), 357\u2013366.","journal-title":"IEEE Transactions on Acoustics, Speech, and Signal Processing"},{"key":"6112_CR15","first-page":"54","volume":"41","author":"JS Deery","year":"2007","unstructured":"Deery, J. S. (2007). The \u2018real\u2019 history of real-time spectrum analyzers a 50-year trip down memory lane. Sound and Vibration, 41, 54\u201359.","journal-title":"Sound and Vibration"},{"key":"6112_CR16","unstructured":"Doersch, C. (2016). Tutorial on variational autoencoders. arXiv:160605908"},{"issue":"7","key":"6112_CR17","doi-asserted-by":"publisher","first-page":"1553","DOI":"10.1109\/TMM.2013.2267205","volume":"15","author":"G Evangelopoulos","year":"2013","unstructured":"Evangelopoulos, G., Zlatintsi, A., Potamianos, A., Maragos, P., Rapantzikos, K., Skoumas, G., & Avrithis, Y. (2013). Multimodal saliency and fusion for movie summarization based on aural, visual, and textual attention. IEEE Transactions on Multimedia, 15(7), 1553\u20131568.","journal-title":"IEEE Transactions on Multimedia"},{"key":"6112_CR18","doi-asserted-by":"crossref","unstructured":"Faruk, A., Faraby, H. A., Azad, M. M., Fedous, M. R., & Morol, M. K. (2020). Image to Bengali caption generation using deep cnn and bidirectional gated recurrent unit. In 2020 23rd international conference on computer and information technology (ICCIT) (pp. 1\u20136).","DOI":"10.1109\/ICCIT51783.2020.9392697"},{"issue":"1","key":"6112_CR19","first-page":"540","volume":"2","author":"R Fathima","year":"2013","unstructured":"Fathima, R., & Raseena, P. (2013). Gammatone cepstral coefficient for speaker identification. International Journal of Advanced Research in Electrical, Electronics and Instrumentation Engineering, 2(1), 540\u2013545.","journal-title":"International Journal of Advanced Research in Electrical, Electronics and Instrumentation Engineering"},{"key":"6112_CR20","doi-asserted-by":"publisher","first-page":"829","DOI":"10.1162\/neco_a_01273","volume":"32","author":"J Gao","year":"2020","unstructured":"Gao, J., Li, P., Chen, Z., & Zhang, J. (2020). A survey on deep learning for multimodal data fusion. Neural Computation, 32, 829\u2013864.","journal-title":"Neural Computation"},{"key":"6112_CR21","unstructured":"Garg, A., Noyola, J., Bagadia, S. (2016). Lip reading using cnn and lstm. Technical report, Stanford University, CS231 n project report."},{"key":"6112_CR22","unstructured":"Goodfellow, I., Bengio, Y., & Courville, A. (2016). Deep learning. MIT Press. http:\/\/www.deeplearningbook.org"},{"key":"6112_CR23","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., & Schmidhuber, J. (2005). Bidirectional lstm networks for improved phoneme classification and recognition. In International conference on artificial neural networks (pp. 799\u2013804). Springer.","DOI":"10.1007\/11550907_126"},{"issue":"8","key":"6112_CR24","doi-asserted-by":"publisher","first-page":"1771","DOI":"10.1162\/089976602760128018","volume":"14","author":"GE Hinton","year":"2002","unstructured":"Hinton, G. E. (2002). Training products of experts by minimizing contrastive divergence. Neural Computation, 14(8), 1771\u20131800.","journal-title":"Neural Computation"},{"key":"6112_CR25","unstructured":"Hinton, G. E., & Zemel, R. S. (1994). Autoencoders, minimum description length and helmholtz free energy. In Advances in neural information processing systems (pp. 3\u201310)."},{"issue":"8","key":"6112_CR26","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"6112_CR27","doi-asserted-by":"crossref","unstructured":"Jogin, M., Madhulika, M., Divya, G., Meghana, R., Apoorva, S., et al. (2018). Feature extraction using convolution neural networks (cnn) and deep learning. In 2018 3rd IEEE international conference on recent trends in electronics, information and communication technology (RTEICT) (pp. 2319\u20132323). IEEE.","DOI":"10.1109\/RTEICT42901.2018.9012507"},{"key":"6112_CR28","doi-asserted-by":"publisher","first-page":"720","DOI":"10.1007\/978-3-642-04898-2_327","volume-title":"Kullback\u2013Leibler divergence","author":"JM Joyce","year":"2011","unstructured":"Joyce, J. M. (2011). Kullback\u2013Leibler divergence (pp. 720\u2013722). Berlin: Springer. https:\/\/doi.org\/10.1007\/978-3-642-04898-2_327."},{"issue":"4","key":"6112_CR29","first-page":"111","volume":"1","author":"B Karlik","year":"2011","unstructured":"Karlik, B., & Olgac, A. V. (2011). Performance analysis of various activation functions in generalized mlp architectures of neural networks. International Journal of Artificial Intelligence and Expert Systems, 1(4), 111\u2013122.","journal-title":"International Journal of Artificial Intelligence and Expert Systems"},{"key":"6112_CR30","doi-asserted-by":"crossref","unstructured":"Kazemi, V., & Sullivan, J. (2014). One millisecond face alignment with an ensemble of regression trees. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1867\u20131874).","DOI":"10.1109\/CVPR.2014.241"},{"issue":"6","key":"6112_CR31","first-page":"1719","volume":"30","author":"S Kim","year":"2014","unstructured":"Kim, S., & Cho, K. (2014). Fast calculation of histogram of oriented gradient feature by removing redundancy in overlapping block. J Inf Sci Eng, 30(6), 1719\u20131731.","journal-title":"J Inf Sci Eng"},{"key":"6112_CR32","unstructured":"Kingma, D. P, & Welling, M. (2014). Auto-encoding variational bayes. CoRR arXiv:1312.6114"},{"issue":"2","key":"6112_CR33","doi-asserted-by":"publisher","first-page":"233","DOI":"10.1002\/aic.690370209","volume":"37","author":"MA Kramer","year":"1991","unstructured":"Kramer, M. A. (1991). Nonlinear principal component analysis using autoassociative neural networks. AIChE Journal, 37(2), 233\u2013243.","journal-title":"AIChE Journal"},{"issue":"6","key":"6112_CR34","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2017). Imagenet classification with deep convolutional neural networks. Communications of the ACM, 60(6), 84\u201390.","journal-title":"Communications of the ACM"},{"key":"6112_CR35","doi-asserted-by":"publisher","DOI":"10.14569\/IJACSA.2020.0110321","author":"KP Lakshmi","year":"2020","unstructured":"Lakshmi, K. P., Solanki, M., Dara, J. S., & Kompalli, A. B. (2020). Video genre classification using convolutional recurrent neural networks. International Journal of Advanced Computer Science and Applications. https:\/\/doi.org\/10.14569\/IJACSA.2020.0110321.","journal-title":"International Journal of Advanced Computer Science and Applications"},{"key":"6112_CR36","doi-asserted-by":"crossref","unstructured":"Li, L., Zhao, Y., Jiang, D., Zhang, Y., Wang, F., Gonzalez, I., Valentin, E., & Sahli, H. (2013). Hybrid deep neural network\u2013hidden markov model (dnn-hmm) based speech emotion recognition. In 2013 Humaine association conference on affective computing and intelligent interaction (pp. 312\u2013317). IEEE.","DOI":"10.1109\/ACII.2013.58"},{"key":"6112_CR37","doi-asserted-by":"crossref","unstructured":"Morvant, E., Habrard, A., & Ayache, S. (2014). Majority vote of diverse classifiers for late fusion. In Joint IAPR international workshops on statistical techniques in pattern recognition (SPR) and structural and syntactic pattern recognition (SSPR) (pp 153\u2013162). Springer.","DOI":"10.1007\/978-3-662-44415-3_16"},{"key":"6112_CR38","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., & Ng, A. Y. (2011). Multimodal deep learning. In L. Getoor & T. Scheffer (Eds.), ICML (pp. 689\u2013696). Omnipress. http:\/\/dblp.uni-trier.de\/db\/conf\/icml\/icml2011.html#NgiamKKNLN11."},{"key":"6112_CR39","doi-asserted-by":"publisher","unstructured":"Parkhi, O. M., Vedaldi, A., & Zisserman, A. (2015). Deep face recognition. In British machine vision conference (pp. 41.1\u201341.12,). BMVA Press. https:\/\/doi.org\/10.5244\/C.29.41","DOI":"10.5244\/C.29.41"},{"key":"6112_CR40","doi-asserted-by":"crossref","unstructured":"Patterson, E. K., Gurbuz, S., Tufekci, Z., & Gowdy, J. N. (2002). Cuave: A new audio-visual database for multimodal human-computer interface research. In 2002 IEEE international conference on acoustics, speech, and signal processing (Vol. 2, pp. II\u20132017). IEEE.","DOI":"10.1109\/ICASSP.2002.1006168"},{"key":"6112_CR41","doi-asserted-by":"crossref","unstructured":"Petridis, S., Li, Z., & Pantic, M. (2017). End-to-end visual speech recognition with lstms. In 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp 2592\u20132596). IEEE.","DOI":"10.1109\/ICASSP.2017.7952625"},{"issue":"9","key":"6112_CR42","doi-asserted-by":"publisher","first-page":"1306","DOI":"10.1109\/JPROC.2003.817150","volume":"91","author":"G Potamianos","year":"2003","unstructured":"Potamianos, G., Neti, C., Gravier, G., Garg, A., & Senior, A. W. (2003). Recent advances in the automatic recognition of audiovisual speech. Proceedings of the IEEE, 91(9), 1306\u20131326.","journal-title":"Proceedings of the IEEE"},{"key":"6112_CR43","unstructured":"Povey, D., Ghoshal, A., Boulianne, G., Burget, L., Glembek, O., Goel, N., Hannemann, M., Motlicek, P., Qian, Y., Schwarz, P., et al. (2011). The kaldi speech recognition toolkit. In IEEE 2011 workshop on automatic speech recognition and understanding, IEEE Signal Processing Society, CONF."},{"key":"6112_CR44","doi-asserted-by":"crossref","unstructured":"Povey, D., Peddinti, V., Galvez, D., Ghahremani, P., Manohar, V., Na, X., Wang, Y., & Khudanpur, S. (2016). Purely sequence-trained neural networks for asr based on lattice-free mmi. In Interspeech (pp. 2751\u20132755).","DOI":"10.21437\/Interspeech.2016-595"},{"key":"6112_CR45","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1016\/j.dsp.2018.06.004","volume":"82","author":"MH Rahmani","year":"2018","unstructured":"Rahmani, M. H., Almasganj, F., & Seyyedsalehi, S. A. (2018). Audio-visual feature fusion via deep neural networks for automatic speech recognition. Digital Signal Processing, 82, 54\u201363.","journal-title":"Digital Signal Processing"},{"key":"6112_CR46","unstructured":"Ranganath, R., Gerrish, S., & Blei, D. (2014). Black box variational inference. In Artificial intelligence and statistics, PMLR (pp. 814\u2013822)."},{"issue":"6088","key":"6112_CR47","doi-asserted-by":"publisher","first-page":"533","DOI":"10.1038\/323533a0","volume":"323","author":"DE Rumelhart","year":"1986","unstructured":"Rumelhart, D. E., Hinton, G. E., & Williams, R. J. (1986). Learning representations by back-propagating errors. Nature, 323(6088), 533\u2013536.","journal-title":"Nature"},{"key":"6112_CR48","doi-asserted-by":"crossref","unstructured":"Sagonas, C., Tzimiropoulos, G., Zafeiriou, S., & Pantic, M. (2013). 300 faces in-the-wild challenge: The first facial landmark localization challenge. In Proceedings of the IEEE international conference on computer vision workshops (pp. 397\u2013403).","DOI":"10.1109\/ICCVW.2013.59"},{"key":"6112_CR49","doi-asserted-by":"publisher","first-page":"107020","DOI":"10.1016\/j.apacoust.2019.107020","volume":"158","author":"G Sharma","year":"2020","unstructured":"Sharma, G., Umapathy, K., & Krishnan, S. (2020). Trends in audio signal feature extraction methods. Applied Acoustics, 158, 107020.","journal-title":"Applied Acoustics"},{"key":"6112_CR50","doi-asserted-by":"crossref","unstructured":"Shekar, B., & Dagnew, G. (2019). Grid search-based hyperparameter tuning and classification of microarray cancer data. In 2019 second international conference on advanced computational and communication paradigms (ICACCP) (pp. 1\u20138). IEEE.","DOI":"10.1109\/ICACCP.2019.8882943"},{"issue":"2","key":"6112_CR51","doi-asserted-by":"publisher","first-page":"216","DOI":"10.1016\/S1007-0214(11)70032-3","volume":"16","author":"C Shu","year":"2011","unstructured":"Shu, C., Ding, X., & Fang, C. (2011). Histogram of the oriented gradient for face recognition. Tsinghua Science and Technology, 16(2), 216\u2013224.","journal-title":"Tsinghua Science and Technology"},{"key":"6112_CR52","doi-asserted-by":"crossref","unstructured":"Shutova, E., Kiela, D., & Maillard, J. (2016). Black holes and white rabbits: Metaphor identification with visual features. In Proceedings of the 2016 conference of the north american chapter of the association for computational linguistics: human language technologies (pp. 160\u2013170).","DOI":"10.18653\/v1\/N16-1020"},{"key":"6112_CR53","doi-asserted-by":"publisher","first-page":"301","DOI":"10.14445\/22315381\/IJETT-V48P253","volume":"48","author":"KM Tarwani","year":"2017","unstructured":"Tarwani, K. M., & Edem, S. (2017). Survey on recurrent neural network in natural language processing. International Journal of Engineering Trends and Technology, 48, 301\u2013304.","journal-title":"International Journal of Engineering Trends and Technology"},{"issue":"3","key":"6112_CR54","doi-asserted-by":"publisher","first-page":"441","DOI":"10.1109\/tcbb.2007.1015","volume":"4","author":"T Thireou","year":"2007","unstructured":"Thireou, T., & Reczko, M. (2007). Bidirectional long short-term memory networks for predicting the subcellular localization of eukaryotic proteins. IEEE\/ACM Transactions on Computational Biology and Bioinformatics, 4(3), 441\u2013446.","journal-title":"IEEE\/ACM Transactions on Computational Biology and Bioinformatics"},{"issue":"4","key":"6112_CR55","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1007\/s13244-018-0639-9","volume":"9","author":"R Yamashita","year":"2018","unstructured":"Yamashita, R., Nishio, M., Do, R. K. G., & Togashi, K. (2018). Convolutional neural networks: An overview and application in radiology. Insights into Imaging, 9(4), 611\u2013629.","journal-title":"Insights into Imaging"},{"key":"6112_CR56","doi-asserted-by":"crossref","unstructured":"Yang, X., Ramesh, P., Chitta, R., Madhvanath, S., Bernal, E. A., & Luo, J. (2017). Deep multimodal representation learning from temporal data. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5447\u20135455).","DOI":"10.1109\/CVPR.2017.538"},{"key":"6112_CR57","doi-asserted-by":"crossref","unstructured":"Yu, J., Zhang, S. X., Wu, J., Ghorbani, S., Wu, B., Kang, S., Liu, S., Liu, X., Meng, H., & Yu, D. (2020). Audio-visual recognition of overlapped speech for the lrs2 dataset. In ICASSP 2020\u20132020 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp 6984\u20136988). IEEE.","DOI":"10.1109\/ICASSP40776.2020.9054127"},{"key":"6112_CR58","doi-asserted-by":"crossref","unstructured":"Zaytseva, E., Segu\u00ed, S., & Vitria, J. (2012). Sketchable histograms of oriented gradients for object detection. In Iberoamerican congress on pattern recognition (pp. 374\u2013381). Springer.","DOI":"10.1007\/978-3-642-33275-3_46"},{"key":"6112_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, S., Lei, M., Ma, B., & Xie, L. (2019). Robust audio-visual speech recognition using bimodal dfsmn with multi-condition training and dropout regularization. In ICASSP 2019-2019 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 6570\u20136574). IEEE.","DOI":"10.1109\/ICASSP.2019.8682566"},{"issue":"1","key":"6112_CR60","first-page":"1799","volume":"15","author":"J Zhu","year":"2014","unstructured":"Zhu, J., Chen, N., & Xing, E. P. (2014). Bayesian inference with posterior regularization and applications to infinite latent svms. The Journal of Machine Learning Research, 15(1), 1799\u20131847.","journal-title":"The Journal of Machine Learning Research"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-021-06112-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-021-06112-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-021-06112-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,13]],"date-time":"2023-11-13T02:35:09Z","timestamp":1699842909000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-021-06112-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,24]]},"references-count":60,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,4]]}},"alternative-id":["6112"],"URL":"https:\/\/doi.org\/10.1007\/s10994-021-06112-5","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,11,24]]},"assertion":[{"value":"1 February 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 October 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 October 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 November 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}},{"value":"Not Applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}},{"value":"Not Applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent to participate"}},{"value":"Not Applicable.","order":5,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"CUAVE dataset is currently stored on the Google Drive account of Prof. John Gowdy. I contacted Prof. John Gowdy via email (jgowdy@clemson.edu) to request access to it.","order":6,"name":"Ethics","group":{"name":"EthicsHeading","label":"Availability of data"}},{"value":"The code hasn\u2019t been publicly available yet.","order":7,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}