{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T18:43:27Z","timestamp":1735584207871},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2018,8,1]],"date-time":"2018-08-01T00:00:00Z","timestamp":1533081600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"Government of Russia and DAAD","award":["8.9957.2017\/5.2"],"award-info":[{"award-number":["8.9957.2017\/5.2"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Multimodal User Interfaces"],"published-print":{"date-parts":[[2018,12]]},"DOI":"10.1007\/s12193-018-0267-1","type":"journal-article","created":{"date-parts":[[2018,8,1]],"date-time":"2018-08-01T06:40:27Z","timestamp":1533105627000},"page":"319-328","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["Multimodal speech recognition: increasing accuracy using high speed video data"],"prefix":"10.1007","volume":"12","author":[{"given":"Denis","family":"Ivanko","sequence":"first","affiliation":[]},{"given":"Alexey","family":"Karpov","sequence":"additional","affiliation":[]},{"given":"Dmitrii","family":"Fedotov","sequence":"additional","affiliation":[]},{"given":"Irina","family":"Kipyatkova","sequence":"additional","affiliation":[]},{"given":"Dmitry","family":"Ryumin","sequence":"additional","affiliation":[]},{"given":"Dmitriy","family":"Ivanko","sequence":"additional","affiliation":[]},{"given":"Wolfgang","family":"Minker","sequence":"additional","affiliation":[]},{"given":"Milos","family":"Zelezny","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,8,1]]},"reference":[{"key":"267_CR1","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1038\/264746a0","volume":"264","author":"H McGurk","year":"1976","unstructured":"McGurk H, MacDonald J (1976) Hearing lips and seeing voices. Nature 264:746\u2013748","journal-title":"Nature"},{"key":"267_CR2","unstructured":"Neti C, Potamianos G, Luettin J, Matthews I, Glotin H, Vergyri D, Sison J, Mashari A, Zhou J (2000) Audio visual speech recognition. In: Final workshop 2000 report. Center for Language and Speech Processing, The Johns Hopkins University, Baltimore"},{"issue":"9","key":"267_CR3","doi-asserted-by":"publisher","first-page":"1635","DOI":"10.1109\/JPROC.2015.2459017","volume":"103","author":"Aggelos K. Katsaggelos","year":"2015","unstructured":"Katsaggelos K, Bahaadini S, Molina R (2015) Audiovisual fusion: challenges and new approaches. In: Proceedings of the IEEE, vol 103(9), pp 1635\u20131653","journal-title":"Proceedings of the IEEE"},{"issue":"2","key":"267_CR4","doi-asserted-by":"publisher","first-page":"136","DOI":"10.1016\/j.csl.2009.03.007","volume":"24","author":"D Dean","year":"2010","unstructured":"Dean D, Sridharan S (2010) Dynamic visual features for audio\u2013visual speaker verification. Comput Speech Lang 24(2):136\u2013149","journal-title":"Comput Speech Lang"},{"key":"267_CR5","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/978-3-319-66429-3_20","volume-title":"Speech and Computer","author":"Eugene Luckyanets","year":"2017","unstructured":"Luckyanets E, Melnikov A, Kudashev O, Novoselov S, Lavrentyeva G (2017) Bimodal anti-spoofing system for mobile security. In: SPECOM 2017, LNAI 10458, pp 211\u2013220"},{"key":"267_CR6","doi-asserted-by":"crossref","unstructured":"Akhtiamov O, Sidorov M, Karpov A, Minker W (2017) Speech and text analysis for multimodal addressee detection in human\u2013human\u2013computer interaction. In: Proceedings of the interspeech 2017, pp 2521\u20132525","DOI":"10.21437\/Interspeech.2017-501"},{"issue":"4","key":"267_CR7","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1007\/s12193-015-0207-2","volume":"10","author":"HM Shamim","year":"2016","unstructured":"Shamim HM, Muhammad G (2016) Audio\u2013visual emotion recognition using multi-directional regression and ridgelet transform. J Multimodal User Interfaces (JMUI) 10(4):325\u2013333","journal-title":"J Multimodal User Interfaces (JMUI)"},{"key":"267_CR8","first-page":"59","volume-title":"Lecture Notes in Computer Science","author":"Dmitrii Fedotov","year":"2017","unstructured":"Fedotov D, Sidorov M, Minker W (2017) Context-awarded models in time-continuous multidimensional affect recognition. In: ICR 2017, LNAI 10459, pp 59\u201366"},{"key":"267_CR9","unstructured":"Liu Q, Wang W, Jackson P (2011) A visual voice activity detection method with adaboosting. In: Proceedings of the sensor signal process defence, pp 1\u20135"},{"issue":"3","key":"267_CR10","doi-asserted-by":"publisher","first-page":"864","DOI":"10.1109\/TMM.2014.2301977","volume":"16","author":"M Barnard","year":"2014","unstructured":"Barnard M et al (2014) Robust multi-speaker tracking via dictionary learning and identity modeling. IEEE Trans Multimed 16(3):864\u2013880","journal-title":"IEEE Trans Multimed"},{"key":"267_CR11","doi-asserted-by":"crossref","unstructured":"Kaya H, Karpov A (2017) Introducing weighted kernel classifiers for handling imbalanced paralinguistic corpora: snoring, addressee and cold. In: Proceedings of the interspeech 2017, pp 3527\u20133531","DOI":"10.21437\/Interspeech.2017-653"},{"issue":"10","key":"267_CR12","doi-asserted-by":"publisher","first-page":"1692","DOI":"10.1109\/JPROC.2010.2057231","volume":"98","author":"ST Shivappa","year":"2010","unstructured":"Shivappa ST, Trivedi ST (2010) Audiovisual information fusion in human\u2013computer interfaces and intelligent environments: a survey. Proc IEEE 98(10):1692\u20131715","journal-title":"Proc IEEE"},{"key":"267_CR13","doi-asserted-by":"crossref","unstructured":"Khokhlov Y, Tomashenko N, Medennikov I, Romanenko A (2017) Fast and accurate OOV decoder on high-level features. In: Proceedings of the interspeech 2017, pp 2884\u20132888","DOI":"10.21437\/Interspeech.2017-1367"},{"key":"267_CR14","unstructured":"Ngiam J et\u00a0al (2011) Multimodal deep learning. In: Proceedings of the 28th international conference of machine learning, pp 689\u2013696"},{"key":"267_CR15","unstructured":"Chetty G, Wagner M (2006) Audio\u2013visual multimodal fusion for biometric person authentication and liveness verification. In: Proceedings of the NICTA-HCSNet multimodal user interaction workshop, vol 57, pp 17\u201324"},{"issue":"6","key":"267_CR16","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1007\/s00530-010-0182-0","volume":"16","author":"PK Atrey","year":"2010","unstructured":"Atrey PK, Hossain MA, Saddik E, Kankanhalli MS (2010) Multimodal fusion for multimedia analysis: a survey. Multimed Syst 16(6):345\u2013379","journal-title":"Multimed Syst"},{"issue":"1","key":"267_CR17","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1145\/1126004.1126007","volume":"2","author":"H Xu","year":"2006","unstructured":"Xu H, Chua TS (2006) Fusion of AV features and external information sources for event detection in team sport video. ACM Trans Multimed Comput Commun Appl 2(1):44\u201367","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"267_CR18","unstructured":"Dean D.B (2008) Synchronous HMMs for audio\u2013visual speech processing. Ph.D. dissertation, Queensland University"},{"issue":"1","key":"267_CR19","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1007\/s10458-009-9092-y","volume":"20","author":"LP Morency","year":"2010","unstructured":"Morency LP, Kok I, Gratch J (2010) A probabilistic multimodal approach for predicting listener backchannels. Auton Agents Multi-Agents Syst 20(1):70\u201384","journal-title":"Auton Agents Multi-Agents Syst"},{"key":"267_CR20","doi-asserted-by":"crossref","unstructured":"Lv G, Jiang D, Zhao R, Hou Y (2007) Multi-stream asynchrony modeling for audio\u2013visual speech recognition. In: Proceedings of the 9th IEEE international symposium multimedia, pp 37\u201344","DOI":"10.1109\/ISM.2007.4412354"},{"issue":"1","key":"267_CR21","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1007\/s12193-016-0222-y","volume":"11","author":"C Torres-Valencia","year":"2017","unstructured":"Torres-Valencia C, Alvarez-Lopez M, Orozco-Gutierrez A (2017) SVM-based feature selection methods for emotion recognition from multimodal data. J Multimodal User Interfaces (JMUI) 11(1):9\u201323","journal-title":"J Multimodal User Interfaces (JMUI)"},{"key":"267_CR22","unstructured":"Terry L (2011) Audio\u2013visual asynchrony modeling and analysis for speech alignment and recognition. Ph.D. dissertation, Northwestern University"},{"key":"267_CR23","unstructured":"Nefian AV et al (2002) A coupled HMM for audio\u2013visual speech recognition. In: Proceedings of the IEEE international conference acoustic speech signal processing, vol 2, pp 2009\u20132013"},{"issue":"4","key":"267_CR24","doi-asserted-by":"publisher","first-page":"1145","DOI":"10.1109\/TASL.2011.2172427","volume":"20","author":"V Estellers","year":"2012","unstructured":"Estellers V, Gurban M, Thiran J (2012) On dynamic stream weighting for audio\u2013visual speech recognition. IEEE Trans Audio Speech Lang Process 20(4):1145\u20131157","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"267_CR25","unstructured":"Abdelaziz AH, Kolossa D (2014) Dynamic stream weight estimation in coupled HMM-based audio\u2013visual speech recognition using multilayer perceptrons. In: Proceedings of the interspeech, pp 1144\u20131148"},{"key":"267_CR26","unstructured":"Chitu AG, Rothkrantz LJM (2007) The influence of video sampling rate on lipreading performance. In: Proceedings of the international conference on speech and computer SPECOM 2007. Moscow, pp 678\u2013684"},{"key":"267_CR27","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1007\/978-3-642-15760-8_33","volume-title":"Text, Speech and Dialogue","author":"Alin Gavril Chitu","year":"2010","unstructured":"Chitu AG, Driel K, Rothkrantz LJM (2010) Automatic lip reading in the Dutch language using active appearance models on high speed recordings. In: Text, speech and dialogue, Springer LNCS (LNAI) 2010, vol 6231, pp 259\u2013266"},{"key":"267_CR28","doi-asserted-by":"crossref","unstructured":"Polykovsky S, Kameda Y, Ohta Y (2009) Facial micro-expressions recognition using high speed camera and 3D-gradient descriptor. In: Proceedings of the 3rd international conference on crime detection and prevention (ICDP). Tsukuba, pp 1\u20136","DOI":"10.1049\/ic.2009.0244"},{"key":"267_CR29","unstructured":"Bettadapura V (2012) Face expression recognition and analysis: the state of the art. Technical Report, College of Computing, Georgia Institute of Technology, pp 1\u201327"},{"key":"267_CR30","unstructured":"Ohzeki K (2006) Video analysis for detecting eye blinking using a high-speed camera. In: Proceedings of the 40th Asilomar conference on signals, systems and computers (ACSSC). Pacific Grove, Part 1, pp 1081\u20131085"},{"key":"267_CR31","unstructured":"Chitu AG, Rothkrantz LJM (2008) On dual view lipreading using high speed camera. In: Proceedings of the 14th annual scientific conference euromedia. Ghent, pp 43\u201351"},{"key":"267_CR32","doi-asserted-by":"publisher","first-page":"338","DOI":"10.1007\/978-3-319-43958-7_40","volume-title":"Speech and Computer","author":"Vasilisa Verkhodanova","year":"2016","unstructured":"Verkhodanova V, Ronzhin A, Kipyatkova I, Ivanko D, Karpov A, Zelezny M (2016) HAVRUS corpus: high-speed recordings of audio\u2013visual Russian speech. In: Ronzhin A, Potapova R, Nmeth G (eds) Speech and computer. SPECOM 2016. Lecture notes in computer science, vol 9811. Springer, Cham"},{"key":"267_CR33","doi-asserted-by":"crossref","unstructured":"Karpov A, Ronzhin A, Markov K, Zelezny M (2010) Viseme-dependent weight optimization for CHMM-based audio\u2013visual speech recognition. In: Proceedings of the interspeech 2010, pp 2678\u20132681","DOI":"10.21437\/Interspeech.2010-710"},{"issue":"12","key":"267_CR34","doi-asserted-by":"publisher","first-page":"2190","DOI":"10.1134\/S000511791412008X","volume":"75","author":"A Karpov","year":"2014","unstructured":"Karpov A (2014) An automatic multimodal speech recognition system with audio and video information. Autom Remote Control 75(12):2190\u20132200","journal-title":"Autom Remote Control"},{"key":"267_CR35","doi-asserted-by":"publisher","first-page":"757","DOI":"10.1007\/978-3-319-66429-3_76","volume-title":"Speech and Computer","author":"Denis Ivanko","year":"2017","unstructured":"Ivanko D, Karpov A, Ryumin D, Kipyatkova I, Saveliev A, Budkov V, Ivanko D, Zelezny M (2017) Using a high-speed video Camera for robust audio\u2013visual speech recognition in acoustically noisy conditions. In: SPECOM 2017, LNAI 10458, pp 757\u2013766"},{"key":"267_CR36","doi-asserted-by":"crossref","unstructured":"Lee B, Hasegawa-Johnson M, Goudeseune C, Kamdar S, Borys S, Liu M, Huang T (2004) AVICAR: audio\u2013visual speech corpus in a car environment. In: Proceedings of the interspeech, pp 380\u2013383","DOI":"10.21437\/Interspeech.2004-424"},{"key":"267_CR37","unstructured":"Cox S, Harvey R, Lan Y, Newman J, Theobald B (2008) The challenge of multispeaker lip-reading. In: Proceedings of the international conference auditory-visual speech process (AVSP), pp 179\u2013184"},{"key":"267_CR38","unstructured":"Patterson E, Gurbuz S, Tufekci Z, Gowdy J (2002) CUAVE: a new audio\u2013visual database for multimodal human\u2013computer interface research. In: Proceedings of the IEEE ICASSP 2002, vol 2, pp 2017\u20132020"},{"key":"267_CR39","doi-asserted-by":"crossref","unstructured":"Hazen T, Saenko K, La C, Glass J (2004) A segment-base audio\u2013visual speech recognizer: data collection, development, and initial experiments. In: Proceedings of the international conference multimodal interfaces, pp 235\u2013242","DOI":"10.1145\/1027933.1027972"},{"key":"267_CR40","unstructured":"Lucey P, Potaminanos G, Sridharan S (2008) Patch-based analysis of visual speech from multiple views. In: Proceedings of the AVSP 2008, pp 69\u201374"},{"key":"267_CR41","unstructured":"Abhishek N, Prasanta KG (2017) PRAV: a phonetically rich audio visual corpus. In: Proceedings of the interspeech 2017, pp 3747\u20133751"},{"issue":"9","key":"267_CR42","doi-asserted-by":"publisher","first-page":"590","DOI":"10.1016\/j.imavis.2014.06.004","volume":"32","author":"Ziheng Zhou","year":"2014","unstructured":"Zhou Z, Zhao G, Hong X, Pietikainen M (2014) A review of recent advances in visual speech decoding. In: Proceedings of the image and vision computing, vol 32, pp 590\u2013605","journal-title":"Image and Vision Computing"},{"key":"267_CR43","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1007\/978-3-319-11581-8_6","volume-title":"Speech and Computer","author":"Alexey Karpov","year":"2014","unstructured":"Karpov A, Kipyatkova I, Zelezny M (2014) A framework for recording audio\u2013visual speech corpora with a microphone and a high-speed camera. In: Speech and computer. SPECOM 2014. Lecture notes in computer science, vol 8773. Springer, Cham"},{"issue":"1","key":"267_CR44","doi-asserted-by":"publisher","first-page":"40","DOI":"10.1109\/TPAMI.2007.250598","volume":"29","author":"S Yan","year":"2007","unstructured":"Yan S, Xu D, Zhang H, Yang Q, Lin S (2007) Graph embedding and extensions: a general framework for dimensionality reduction. IEEE Trans Pattern Anal Mach Intell 29(1):40\u201351","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"267_CR45","doi-asserted-by":"crossref","unstructured":"Hong S, Yao H, Wan Y, Chen R (2006) A PCA based visual DCT feature extraction method for lip-reading. In: Proceedings of the intelligent informatics hiding multimedia, signal process, pp 321\u2013326","DOI":"10.1109\/IIH-MSP.2006.265008"},{"key":"267_CR46","unstructured":"Yoshinaga T, Tamura S, Iwano K, Furui S (2003) Audio\u2013visual speech recognition using lip movement extracted from side-face images. In: Proceedings of the international conference auditory-visual speech processing (AVSP), pp 117\u2013120"},{"issue":"10","key":"267_CR47","doi-asserted-by":"publisher","first-page":"2879","DOI":"10.1109\/TIP.2006.877528","volume":"15","author":"H Cetingul","year":"2006","unstructured":"Cetingul H, Yemez Y, Erzin E, Tekalp A (2006) Discriminative analysis of lip motion features for speaker identification and speech reading. IEEE Trans Image Process 15(10):2879\u20132891","journal-title":"IEEE Trans Image Process"},{"issue":"2","key":"267_CR48","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1007\/s12193-017-0241-3","volume":"11","author":"S Kumar","year":"2017","unstructured":"Kumar S, Bhuyan MK, Chakraborty BK (2017) Extraction of texture and geometrical features from informative facial regions for sign language recognition. J Multimodal User Interfaces (JMUI) 11(2):227\u2013239","journal-title":"J Multimodal User Interfaces (JMUI)"},{"key":"267_CR49","unstructured":"Lan Y, Theobald B, Harvey E, Ong E, Bowden R (2010) Improving visual features for lip-reading. In: Proceedings of the AVSP 2010, pp 142\u2013147"},{"key":"267_CR50","unstructured":"Chu SM, Huang TS (2002) Multi-modal sensory fusion with application to audio\u2013visual speech recognition. In: Proceedings of the multi-modal speech recognition workshop-2002, Greensboro"},{"key":"267_CR51","doi-asserted-by":"crossref","first-page":"230","DOI":"10.1007\/978-3-319-14364-4_22","volume-title":"Advances in Visual Computing","author":"Helen L. Bear","year":"2014","unstructured":"Bear H, Harvey R, Theobald B, Lan Y (2014) Which phoneme-to-viseme maps best improve visual-only computer lip-reading. In: Advances in visual computing. Springer, Berlin, pp 230\u2013239"},{"issue":"2","key":"267_CR52","doi-asserted-by":"publisher","first-page":"175","DOI":"10.1109\/TCYB.2013.2250954","volume":"44","author":"D Stewart","year":"2014","unstructured":"Stewart D, Seymour R, Pass A, Ming J (2014) Robust audio\u2013visual speech recognition under noisy audio\u2013video conditions. IEEE Trans Cybern 44(2):175\u2013184","journal-title":"IEEE Trans Cybern"},{"key":"267_CR53","doi-asserted-by":"crossref","unstructured":"Huang J, Kingsbury B (2013) Audio\u2013visual deep learning for noise robust speech recognition. In: Proceedings of the IEEE international conference on acoustics, speech, and signal processing, pp 7596\u20137599","DOI":"10.1109\/ICASSP.2013.6639140"}],"container-title":["Journal on Multimodal User Interfaces"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s12193-018-0267-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s12193-018-0267-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s12193-018-0267-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,28]],"date-time":"2022-08-28T15:07:39Z","timestamp":1661699259000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s12193-018-0267-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,8,1]]},"references-count":53,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2018,12]]}},"alternative-id":["267"],"URL":"https:\/\/doi.org\/10.1007\/s12193-018-0267-1","relation":{},"ISSN":["1783-7677","1783-8738"],"issn-type":[{"value":"1783-7677","type":"print"},{"value":"1783-8738","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,8,1]]},"assertion":[{"value":"3 December 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 June 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 August 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}