{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:25:46Z","timestamp":1740122746743,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,11,14]],"date-time":"2023-11-14T00:00:00Z","timestamp":1699920000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,14]],"date-time":"2023-11-14T00:00:00Z","timestamp":1699920000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100008628","name":"Ministry of Electronics and Information technology","doi-asserted-by":"publisher","award":["12(5)\/2015-ESD"],"award-info":[{"award-number":["12(5)\/2015-ESD"]}],"id":[{"id":"10.13039\/501100008628","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s10772-023-10060-x","type":"journal-article","created":{"date-parts":[[2023,11,14]],"date-time":"2023-11-14T09:02:23Z","timestamp":1699952543000},"page":"933-945","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Usefulness of glottal excitation source information for audio-visual speech recognition 
system"],"prefix":"10.1007","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2478-909X","authenticated-orcid":false,"given":"Salam","family":"Nandakishor","sequence":"first","affiliation":[]},{"given":"Debadatta","family":"Pati","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,14]]},"reference":[{"issue":"23","key":"10060_CR1","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/0167-6393(92)90005-R","volume":"11","author":"P Alku","year":"1992","unstructured":"Alku, P. (1992). Glottal wave analysis with pitch synchronous iterative adaptive inverse filtering. Speech Communication, 11(23), 109\u2013118.","journal-title":"Speech Communication"},{"issue":"5","key":"10060_CR2","first-page":"240","volume":"48","author":"P Alku","year":"1996","unstructured":"Alku, P., & Vilkman, E. (1996). A comparison of glottal voice source quantification parameters in breathy, normal and pressed phonation of female and male speakers. IEEE Transactions on Audio, Speech and Language Processing, 48(5), 240\u2013254.","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"key":"10060_CR3","doi-asserted-by":"publisher","first-page":"40","DOI":"10.1016\/j.specom.2017.07.001","volume":"95","author":"HL Bear","year":"2017","unstructured":"Bear, H. L., & Harvey, R. (2017). Phoneme-to-viseme mappings: The good, the bad, and the ugly. Speech Communication, 95, 40\u201367.","journal-title":"Speech Communication"},{"key":"10060_CR4","doi-asserted-by":"crossref","unstructured":"Bozkurt, E., et al. (2007). Comparison of phoneme and viseme based acoustic units for speech driven realistic lip animation. In IEEE 15th signal processing and communications applications (pp. 1\u20134).","DOI":"10.1109\/SIU.2007.4298572"},{"key":"10060_CR5","doi-asserted-by":"crossref","unstructured":"Chengalvarayan, R. (1998). On the use of normalized LPC error towards better large vocabulary speech recognition systems. 
In IEEE international conference on acoustics, speech and signal processing (ICASSP).","DOI":"10.1109\/ICASSP.1998.674356"},{"issue":"5","key":"10060_CR6","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1121\/1.2229005","volume":"120","author":"M Cooke","year":"2006","unstructured":"Cooke, M., et al. (2006). An audio-visual corpus for speech perception and automatic speech recognition. The Journal of the Acoustical Society of America, 120(5), 2421\u20132424.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"10060_CR7","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1504\/IJAIP.2022.122193","volume":"21","author":"S Debnath","year":"2022","unstructured":"Debnath, S., & Roy, P. (2022). Audio-visual speech recognition based on machine learning approach. International Journal of Advanced Intelligence Paradigms, 21, 3\u20134.","journal-title":"International Journal of Advanced Intelligence Paradigms"},{"issue":"3","key":"10060_CR8","doi-asserted-by":"publisher","first-page":"994","DOI":"10.1109\/TASL.2011.2170835","volume":"20","author":"T Drugman","year":"2012","unstructured":"Drugman, T., et al. (2012). Detection of glottal closure instants from speech signals: A quantitative review. IEEE Transactions on Audio, Speech, and Language Processing, 20(3), 994\u20131006.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"issue":"1","key":"10060_CR9","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1016\/j.csl.2011.03.003","volume":"26","author":"T Drugmana","year":"2012","unstructured":"Drugmana, T., Bozkurtb, B., & Dutoita, T. (2012). A comparative study of glottal source estimation techniques. Computer Speech and Language, 26(1), 20\u201334.","journal-title":"Computer Speech and Language"},{"key":"10060_CR10","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1109\/6046.865479","volume":"2","author":"S Dupont","year":"2000","unstructured":"Dupont, S., & Luettin, J. (2000). 
Audio-visual speech modeling for continuous speech recognition. IEEE Transactions on Multimedia, 2, 141\u2013151.","journal-title":"IEEE Transactions on Multimedia"},{"issue":"9","key":"10060_CR11","first-page":"497","volume":"24","author":"K Dutta","year":"2009","unstructured":"Dutta, K., Singh, M., & Pati, D. (2009). Detection of replay signals using excitation source and shifted CQCC features. International Journal of Speech Technology, 24(9), 497\u2013507.","journal-title":"International Journal of Speech Technology"},{"key":"10060_CR12","unstructured":"He, J., Liu, L., & Palm, G. (1996). On the use of residual cepstrum in speech recognition. In IEEE international conference on acoustics, speech and signal processing (ICASSP) (Vol. 1, pp. 5\u20138)."},{"key":"10060_CR13","doi-asserted-by":"crossref","unstructured":"Huang, J., & Kingsbury, B. (2013). Audio-visual deep learning for noise robust speech recognition. In International conference on acoustics, speech, and signal processing (ICASSP) (pp. 7596\u20137599).","DOI":"10.1109\/ICASSP.2013.6639140"},{"key":"10060_CR14","doi-asserted-by":"publisher","DOI":"10.1017\/9780511807954","volume-title":"Hand Book of the International Phonetic Association","author":"International Phonetic Association","year":"1999","unstructured":"International Phonetic Association. (1999). Handbook of the international phonetic association. Cambridge University Press."},{"issue":"4","key":"10060_CR15","doi-asserted-by":"publisher","first-page":"564","DOI":"10.1109\/TSMCA.2004.826274","volume":"34","author":"MN Kaynak","year":"2004","unstructured":"Kaynak, M. N., et al. (2004). Analysis of lip geometric features for audio-visual speech recognition. 
IEEE Transactions on Systems, Man, and Cybernetics - Part A: Systems and Humans, 34(4), 564\u2013570.","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics - Part A: Systems and Humans"},{"key":"10060_CR16","doi-asserted-by":"publisher","first-page":"495","DOI":"10.1109\/TMM.2005.846777","volume":"7","author":"S Lucey","year":"2005","unstructured":"Lucey, S., et al. (2005). Integration strategies for audio-visual speech processing: Applied to text-dependent speaker recognition. IEEE Transactions on Multimedia, 7, 495\u2013506.","journal-title":"IEEE Transactions on Multimedia"},{"issue":"10","key":"10060_CR17","doi-asserted-by":"publisher","first-page":"1243","DOI":"10.1016\/j.specom.2006.06.002","volume":"48","author":"SR Mahadeva Prasanna","year":"2006","unstructured":"Mahadeva Prasanna, S. R., & Yengnanarayana, B. (2006). Extraction of speaker-specific excitation information from linear prediction residual of speech. Speech Communication, 48(10), 1243\u20131261.","journal-title":"Speech Communication"},{"issue":"2","key":"10060_CR18","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1007\/s10772-014-9266-0","volume":"18","author":"K Manjunath","year":"2015","unstructured":"Manjunath, K., & Rao, K. S. (2015). Source and system features for phone recognition. International Journal of Speech Technology, 18(2), 257\u2013270.","journal-title":"International Journal of Speech Technology"},{"key":"10060_CR19","doi-asserted-by":"crossref","unstructured":"Matthews, I., Bangham, J. A., & Cox, S. (1996). Audiovisual speech recognition using multiscale nonlinear image decomposition. In International conference on spoken language processing (ICSLP) (pp. 38\u201341).","DOI":"10.21437\/ICSLP.1996-10"},{"key":"10060_CR20","doi-asserted-by":"crossref","unstructured":"Meutzner, H., et al. (2017). Improving audio-visual speech recognition using deep neural networks with dynamic stream reliability estimates. 
In International conference on acoustics, speech and signal processing, (ICASSP).","DOI":"10.1109\/ICASSP.2017.7953172"},{"issue":"8","key":"10060_CR21","doi-asserted-by":"publisher","first-page":"1602","DOI":"10.1109\/TASL.2008.2004526","volume":"16","author":"KSR Murthy","year":"2008","unstructured":"Murthy, K. S. R., & Yegnanarayana, B. (2008). Epoch extraction from speech signals. IEEE Transactions on Audio Speech and Language Processing, 16(8), 1602\u20131613.","journal-title":"IEEE Transactions on Audio Speech and Language Processing"},{"key":"10060_CR22","unstructured":"Nandakishor, S., & Pati, D. (2021). Analysis of Lombard effect by using hybrid visual features for ASR. In Pattern recognition and machine intelligence (PReMI\u201921)."},{"key":"10060_CR23","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1016\/j.csl.2016.06.002","volume":"41","author":"D Nandi","year":"2006","unstructured":"Nandi, D., Pati, D., & Sreenivasa Rao, K. (2006). Implicit processing of LP residual for language identification. Computer Speech & Language, 41, 68\u201387.","journal-title":"Computer Speech & Language"},{"issue":"1","key":"10060_CR24","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1109\/TASL.2006.876878","volume":"15","author":"PA Naylor","year":"2007","unstructured":"Naylor, P. A., et al. (2007). Estimation of glottal closure instants in voiced speech using the DYPSA algorithm. IEEE Transactions on Audio, Speech and Language Processing, 15(1), 34\u201343.","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"key":"10060_CR25","doi-asserted-by":"publisher","first-page":"722","DOI":"10.1007\/s10489-014-0629-7","volume":"42","author":"K Noda","year":"2015","unstructured":"Noda, K., et al. (2015). Audio-visual speech recognition using deep learning. 
Applied Intelligence, 42, 722\u2013737.","journal-title":"Applied Intelligence"},{"issue":"4","key":"10060_CR26","doi-asserted-by":"publisher","first-page":"777","DOI":"10.3390\/sym14040777","volume":"14","author":"K Phapatanaburi","year":"2022","unstructured":"Phapatanaburi, K., et al. (2022). Whispered speech detection using glottal flow-based features. Symmetry, 14(4), 777.","journal-title":"Symmetry"},{"key":"10060_CR27","doi-asserted-by":"publisher","first-page":"1215","DOI":"10.1109\/5.237532","volume":"81","author":"JW Picone","year":"1993","unstructured":"Picone, J. W. (1993). Signal modeling techniques in speech recognition. Proceedings of the IEEE, 81, 1215\u20131247.","journal-title":"Proceedings of the IEEE"},{"issue":"5","key":"10060_CR28","doi-asserted-by":"publisher","first-page":"569","DOI":"10.1109\/89.784109","volume":"7","author":"MD Plumpe","year":"1999","unstructured":"Plumpe, M. D., Quatieri, T. F., & Reynolds, D. A. (1999). Modelling of glottal flow derivative waveform with application to speaker identification. IEEE Transactions on Speech and Audio Processing, 7(5), 569\u2013586.","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"key":"10060_CR29","unstructured":"Povey, D., et\u00a0al. (2011). The Kaldi speech recognition toolkit. In Proceedings of IEEE workshop on automatic speech recognition and understanding."},{"key":"10060_CR30","doi-asserted-by":"crossref","unstructured":"Povey, D., & Saon, G. (2006). Feature and model space speaker adaptation with full covariance gaussians. In Interspeech.","DOI":"10.21437\/Interspeech.2006-349"},{"issue":"12","key":"10060_CR31","doi-asserted-by":"publisher","first-page":"2471","DOI":"10.1109\/TASL.2013.2273717","volume":"21","author":"A Prathosh","year":"2013","unstructured":"Prathosh, A., Ananthapadmanabha, T., & Ramakrishnan, A. (2013). Epoch extraction based on integrated linear prediction residual using plosion index. 
IEEE Transactions on Audio, Speech and Language Processing, 21(12), 2471\u20132480.","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"key":"10060_CR32","volume-title":"Fundamentals of speech recognition","author":"LR Rabiner","year":"2012","unstructured":"Rabiner, L. R., Juang, B.-H., & Yegnanarayana, B. (2012). Fundamentals of speech recognition. Pearson Education."},{"key":"10060_CR33","volume-title":"Digital processing of speech signals","author":"LR Rabiner","year":"2009","unstructured":"Rabiner, L. R., & Schafer, R. W. (2009). Digital processing of speech signals. Pearson Education."},{"key":"10060_CR34","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1109\/TASL.2010.2045239","volume":"19","author":"T Raitio","year":"2010","unstructured":"Raitio, T., et al. (2010). HMM-based speech synthesis utilizing glottal inverse filtering. IEEE Transactions on Audio, Speech, and Language Processing, 19, 153\u2013165.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"10060_CR35","doi-asserted-by":"crossref","unstructured":"Rath, S. P., et\u00a0al. (2013). Improved feature processing for deep neural networks. In Interspeech.","DOI":"10.21437\/Interspeech.2013-48"},{"key":"10060_CR36","doi-asserted-by":"crossref","unstructured":"Swietojanski, P. et\u00a0al. (2013). Revisiting hybrid and GMM-HMM system combination techniques. In International conference on acoustics, speech, and signal processing (ICASSP).","DOI":"10.1109\/ICASSP.2013.6638967"},{"key":"10060_CR37","unstructured":"Thangthai, K. et\u00a0al. (2015). Improving lip-reading performance for robust audiovisual speech recognition using DNNs. In FAAVSP\u2014the 1st joint conference on facial analysis, animation and auditory-visual speech processing (pp. 
127\u2013131)."},{"issue":"1","key":"10060_CR38","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/TASL.2011.2157684","volume":"20","author":"MR Thomas","year":"2012","unstructured":"Thomas, M. R., Gudnason, J., & Naylor, P. A. (2012). Estimation of glottal closing and opening instants in voiced speech using the YAGA algorithm. IEEE Transactions on Audio, Speech and Language Processing, 20(1), 82\u201391.","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"issue":"3","key":"10060_CR39","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1007\/s10772-017-9483-4","volume":"21","author":"K Tripathi","year":"2018","unstructured":"Tripathi, K., & Rao, K. S. (2018). Improvement of phone recognition accuracy using speech mode classiffication. International Journal of Speech Technology, 21(3), 489\u2013500.","journal-title":"International Journal of Speech Technology"},{"key":"10060_CR40","doi-asserted-by":"crossref","unstructured":"Vesely, K., et\u00a0al. (2013). Sequence-discriminative training of deep neural networks. In Interspeech.","DOI":"10.21437\/Interspeech.2013-548"},{"key":"10060_CR41","doi-asserted-by":"crossref","unstructured":"Vincent, E., et al. (2013). The second \u201cCHiME\u201d speech separation and recognition challenge: Datasets, tasks and baselines. In International conference on acoustics, speech, and signal processing (ICASSP) (pp. 126\u2013130).","DOI":"10.1109\/ICASSP.2013.6637622"},{"key":"10060_CR42","doi-asserted-by":"publisher","first-page":"137","DOI":"10.1023\/B:VISI.0000013087.49260.fb","volume":"57","author":"P Viola","year":"2004","unstructured":"Viola, P., & Jones, M. J. (2004). Robust real-time face detection. 
International Journal of Computer Vision, 57, 137\u2013154.","journal-title":"International Journal of Computer Vision"},{"key":"10060_CR43","doi-asserted-by":"publisher","first-page":"1110","DOI":"10.1109\/TSA.2005.853005","volume":"13","author":"B Yegnanarayana","year":"2005","unstructured":"Yegnanarayana, B., Mahadeva Prasanna, M. S. R., Duraiswami, R., & Zotkin, D. (2005). Processing of reverberant speech for time-delay estimation. IEEE Transactions on Audio, Speech, and Language Processing, 13, 1110\u20131118.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"issue":"3","key":"10060_CR44","doi-asserted-by":"publisher","first-page":"267","DOI":"10.1109\/89.841209","volume":"8","author":"B Yengnanarayana","year":"2000","unstructured":"Yengnanarayana, B., & Satyanarayana Murthy, P. (2000). Enhancement of reverberant speech using LP residual signal. IEEE Transactions on Speech and Audio Processing, 8(3), 267\u2013281.","journal-title":"IEEE Transactions on Speech and Audio Processing"}],"container-title":["International Journal of Speech 
Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10060-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-023-10060-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-023-10060-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,11]],"date-time":"2024-01-11T10:13:32Z","timestamp":1704968012000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-023-10060-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,14]]},"references-count":44,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["10060"],"URL":"https:\/\/doi.org\/10.1007\/s10772-023-10060-x","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"type":"print","value":"1381-2416"},{"type":"electronic","value":"1572-8110"}],"subject":[],"published":{"date-parts":[[2023,11,14]]},"assertion":[{"value":"5 July 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 October 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 November 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of Interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"The authors give the consent to publisher for this article 
publication.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}]}}