{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,10]],"date-time":"2025-05-10T07:07:54Z","timestamp":1746860874432},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2021,2,2]],"date-time":"2021-02-02T00:00:00Z","timestamp":1612224000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,2,2]],"date-time":"2021-02-02T00:00:00Z","timestamp":1612224000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Speech Technol"],"published-print":{"date-parts":[[2021,6]]},"DOI":"10.1007\/s10772-021-09797-0","type":"journal-article","created":{"date-parts":[[2021,2,2]],"date-time":"2021-02-02T03:04:54Z","timestamp":1612235094000},"page":"473-481","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":15,"title":["Training augmentation with TANDEM acoustic modelling in Punjabi adult speech recognition system"],"prefix":"10.1007","volume":"24","author":[{"given":"Virender","family":"Kadyan","sequence":"first","affiliation":[]},{"given":"Shashi","family":"Bala","sequence":"additional","affiliation":[]},{"given":"Puneet","family":"Bawa","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,2,2]]},"reference":[{"key":"9797_CR1","doi-asserted-by":"publisher","unstructured":"Bahari, M. H., Saeidi, R., & Van Leeuwen, D. (2013). Accent recognition using i-vector, gaussian mean supervector and gaussian posterior probability supervector for spontaneous telephone speech. In 2013 IEEE international conference on acoustics, speech and signal processing (pp. 7344\u20137348). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2013.6639089","DOI":"10.1109\/ICASSP.2013.6639089"},{"key":"9797_CR2","doi-asserted-by":"publisher","unstructured":"Bell, P., Swietojanski, P., & Renals, S. (2013). Multi-level adaptive networks in tandem and hybrid ASR systems. In 2013 IEEE international conference on acoustics, speech and signal processing (pp. 6975\u20136979). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2013.6639014","DOI":"10.1109\/ICASSP.2013.6639014"},{"issue":"2","key":"9797_CR3","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1109\/TASSP.1979.1163209","volume":"27","author":"S Boll","year":"1979","unstructured":"Boll, S. (1979). Suppression of acoustic noise in speech using spectral subtraction. IEEE Transactions on Acoustics, Speech, and Signal Processing, 27(2), 113\u2013120. https:\/\/doi.org\/10.1109\/TASSP.1979.1163209.","journal-title":"IEEE Transactions on Acoustics, Speech, and Signal Processing"},{"issue":"6","key":"9797_CR4","doi-asserted-by":"publisher","first-page":"752","DOI":"10.1109\/TASSP.1980.1163472","volume":"28","author":"S Boll","year":"1980","unstructured":"Boll, S., & Pulsipher, D. C. (1980). Suppression of acoustic noise in speech using two microphone adaptive noise cancellation. IEEE Transactions on Acoustics, Speech, and Signal Processing, 28(6), 752\u2013753. https:\/\/doi.org\/10.1109\/TASSP.1980.1163472.","journal-title":"IEEE Transactions on Acoustics, Speech, and Signal Processing"},{"issue":"6","key":"9797_CR5","doi-asserted-by":"publisher","first-page":"1379","DOI":"10.1109\/TASL.2009.2034770","volume":"18","author":"H Boril","year":"2009","unstructured":"Boril, H., & Hansen, J. H. (2009). Unsupervised equalization of Lombard effect for speech recognition in noisy adverse environments. IEEE Transactions on Audio, Speech, and Language Processing, 18(6), 1379\u20131393. https:\/\/doi.org\/10.1109\/TASL.2009.2034770.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"9797_CR6","volume-title":"Neural networks for optimization and signal processing","author":"A Cichocki","year":"1993","unstructured":"Cichocki, A., Unbehauen, R., & Swiniarski, R. W. (1993). Neural networks for optimization and signal processing (Vol. 253). New York: Wiley."},{"key":"9797_CR7","doi-asserted-by":"publisher","unstructured":"Ellis, D. P., Singh, R., & Sivadas, S. (2001). Tandem acoustic modeling in large-vocabulary recognition. In 2001 IEEE international conference on acoustics, speech, and signal processing. Proceedings (Cat. No. 01CH37221) (Vol. 1, pp. 517\u2013520). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2001.940881","DOI":"10.1109\/ICASSP.2001.940881"},{"issue":"1","key":"9797_CR8","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/S0095-4470(19)30469-3","volume":"16","author":"O Ghitza","year":"1988","unstructured":"Ghitza, O. (1988). Temporal non-place information in the auditory-nerve firing patterns as a front-end for speech recognition in a noisy environment. Journal of Phonetics, 16(1), 109\u2013123. https:\/\/doi.org\/10.1016\/S0095-4470(19)30469-3.","journal-title":"Journal of Phonetics"},{"key":"9797_CR9","doi-asserted-by":"crossref","unstructured":"Gr\u00e9zl, F., Karafi\u00e1t, M., & Burget, L. (2009). Investigation into bottle-neck features for meeting speech recognition. In Tenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2009-746"},{"issue":"4","key":"9797_CR10","doi-asserted-by":"publisher","first-page":"598","DOI":"10.1109\/89.326618","volume":"2","author":"JH Hansen","year":"1994","unstructured":"Hansen, J. H. (1994). Morphological constrained feature enhancement with adaptive cepstral compensation (MCE-ACC) for speech recognition in noise and Lombard effect. IEEE Transactions on Speech and Audio Processing, 2(4), 598\u2013614. https:\/\/doi.org\/10.1109\/89.326618.","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"key":"9797_CR11","doi-asserted-by":"crossref","unstructured":"Hansen, J. H., & Bria, O. N. (1990). Lombard effect compensation for robust automatic speech recognition in noise. In First International Conference on Spoken Language Processing.","DOI":"10.21437\/ICSLP.1990-298"},{"issue":"4","key":"9797_CR12","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1016\/0167-6393(95)00007-B","volume":"16","author":"JH Hansen","year":"1995","unstructured":"Hansen, J. H., & Cairns, D. A. (1995). Icarus: Source generator based real-time recognition of speech in noisy stressful and lombard effect environments. Speech Communication, 16(4), 391\u2013422. https:\/\/doi.org\/10.1016\/0167-6393(95)00007-B.","journal-title":"Speech Communication"},{"key":"9797_CR13","doi-asserted-by":"publisher","unstructured":"Hermansky, H., Ellis, D. P., & Sharma, S. (2000). Tandem connectionist feature extraction for conventional HMM systems. In 2000 IEEE international conference on acoustics, speech, and signal processing. Proceedings (Cat. No. 00CH37100) (Vol. 3, pp. 1635\u20131638). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2000.862024","DOI":"10.1109\/ICASSP.2000.862024"},{"key":"9797_CR14","doi-asserted-by":"publisher","unstructured":"Hirsch, H. G., & Ehrlicher, C. (1995). Noise estimation techniques for robust speech recognition. In 1995 International conference on acoustics, speech, and signal processing (Vol. 1, pp. 153\u2013156). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.1995.479387","DOI":"10.1109\/ICASSP.1995.479387"},{"key":"9797_CR15","doi-asserted-by":"publisher","unstructured":"Hsu, W. N., Zhang, Y., Weiss, R. J., Chung, Y. A., Wang, Y., Wu, Y., & Glass, J. (2019). Disentangling correlated speaker and noise for speech synthesis via data augmentation and adversarial factorization. In ICASSP 2019\u20132019 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5901\u20135905). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2019.8683561","DOI":"10.1109\/ICASSP.2019.8683561"},{"key":"9797_CR16","doi-asserted-by":"publisher","unstructured":"Huang, J., & Kingsbury, B. (2013). Audio-visual deep learning for noise robust speech recognition. In 2013 IEEE international conference on acoustics, speech and signal processing (pp. 7596\u20137599). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2013.6639140","DOI":"10.1109\/ICASSP.2013.6639140"},{"issue":"1","key":"9797_CR17","doi-asserted-by":"publisher","first-page":"8","DOI":"10.1109\/79.180705","volume":"10","author":"DR Hush","year":"1993","unstructured":"Hush, D. R., & Horne, B. G. (1993). Progress in supervised neural networks. IEEE Signal Processing Magazine, 10(1), 8\u201339. https:\/\/doi.org\/10.1109\/79.180705.","journal-title":"IEEE Signal Processing Magazine"},{"issue":"4","key":"9797_CR18","doi-asserted-by":"publisher","first-page":"761","DOI":"10.1007\/s10772-017-9446-9","volume":"20","author":"V Kadyan","year":"2017","unstructured":"Kadyan, V., Mantri, A., & Aggarwal, R. K. (2017). A heterogeneous speech feature vectors generation approach with hybrid hmm classifiers. International Journal of Speech Technology, 20(4), 761\u2013769. https:\/\/doi.org\/10.1007\/s10772-017-9446-9.","journal-title":"International Journal of Speech Technology"},{"issue":"1","key":"9797_CR19","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1007\/s10772-018-09577-3","volume":"22","author":"V Kadyan","year":"2019","unstructured":"Kadyan, V., Mantri, A., Aggarwal, R. K., & Singh, A. (2019). A comparative study of deep neural network based Punjabi-ASR system. International Journal of Speech Technology, 22(1), 111\u2013119. https:\/\/doi.org\/10.1007\/s10772-018-09577-3.","journal-title":"International Journal of Speech Technology"},{"key":"9797_CR20","doi-asserted-by":"publisher","DOI":"10.1007\/s11831-020-09414-4","author":"J Kaur","year":"2020","unstructured":"Kaur, J., Singh, A., & Kadyan, V. (2020). Automatic speech recognition system for tonal languages: state-of-the-art survey. Archives of Computational Methods in Engineering. https:\/\/doi.org\/10.1007\/s11831-020-09414-4.","journal-title":"Archives of Computational Methods in Engineering"},{"issue":"12","key":"9797_CR21","doi-asserted-by":"publisher","first-page":"2506","DOI":"10.1109\/TASL.2013.2277932","volume":"21","author":"P Lal","year":"2013","unstructured":"Lal, P., & King, S. (2013). Cross-lingual automatic speech recognition using tandem features. IEEE Transactions on Audio, Speech, and Language Processing, 21(12), 2506\u20132515. https:\/\/doi.org\/10.1109\/TASL.2013.2277932.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"9797_CR22","doi-asserted-by":"publisher","unstructured":"Kinnunen, T., Juvela, L., Alku, P., & Yamagishi, J. (2017). Non-parallel voice conversion using i-vector PLDA: Towards unifying speaker verification and transformation. In 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 5535\u20135539). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2017.7953215","DOI":"10.1109\/ICASSP.2017.7953215"},{"key":"9797_CR23","doi-asserted-by":"crossref","unstructured":"Ko, T., Peddinti, V., Povey, D., & Khudanpur, S. (2015). Audio augmentation for speech recognition. In Sixteenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2015-711"},{"key":"9797_CR24","doi-asserted-by":"crossref","unstructured":"Kubat, M. (1999). Neural networks: a comprehensive foundation by Simon Haykin, Macmillan, 1994, ISBN 0\u201302\u2013352781\u20137. The Knowledge Engineering Review, 13(4), 409\u2013412.","DOI":"10.1017\/S0269888998214044"},{"key":"9797_CR25","doi-asserted-by":"publisher","unstructured":"Lippmann, R., Martin, E., & Paul, D. (1987). Multi-style training for robust isolated-word speech recognition. In ICASSP'87. IEEE international conference on acoustics, speech, and signal processing (Vol. 12, pp. 705\u2013708). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.1987.1169544","DOI":"10.1109\/ICASSP.1987.1169544"},{"key":"9797_CR26","doi-asserted-by":"publisher","unstructured":"Lyon, R. (1984). Computational models of neural auditory processing. In ICASSP'84. IEEE international conference on acoustics, speech, and signal processing (Vol. 9, pp. 41\u201344). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.1984.1172756","DOI":"10.1109\/ICASSP.1984.1172756"},{"key":"9797_CR27","doi-asserted-by":"publisher","DOI":"10.1007\/s00034-020-01565-w","author":"K Maity","year":"2020","unstructured":"Maity, K., Pradhan, G., & Singh, J. P. (2020). A pitch and noise robust keyword spotting system using SMAC features with prosody modification. Circuits, Systems, and Signal Processing. https:\/\/doi.org\/10.1007\/s00034-020-01565-w.","journal-title":"Circuits, Systems, and Signal Processing"},{"key":"9797_CR28","first-page":"216","volume-title":"Parallel distributed processing: Explorations in the Microstructure of Cognition","author":"JL McClelland","year":"1986","unstructured":"McClelland, J. L., & Rumelhart, D. E. (1986). Parallel distributed processing: Explorations in the Microstructure of Cognition (Vol. 2, pp. 216\u2013271). Cambridge: MIT Press."},{"key":"9797_CR29","doi-asserted-by":"publisher","unstructured":"Naik, J. M., & Lubensky, D. M. (1994). A hybrid HMM-MLP speaker verification algorithm for telephone speech. In Proceedings of ICASSP'94. IEEE international conference on acoustics, speech and signal processing (Vol. 1, pp. I\u2013153). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.1994.389332","DOI":"10.1109\/ICASSP.1994.389332"},{"key":"9797_CR30","doi-asserted-by":"crossref","unstructured":"Parihar, N., & Picone, J. (2003). Analysis of the Aurora large vocabulary evaluations. In Eighth European conference on speech communication and technology.","DOI":"10.21437\/Eurospeech.2003-139"},{"key":"9797_CR31","doi-asserted-by":"crossref","unstructured":"Plahl, C., Schl\u00fcter, R., & Ney, H. (2010). Hierarchical bottle neck features for LVCSR. In Eleventh annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2010-375"},{"key":"9797_CR32","doi-asserted-by":"publisher","unstructured":"Povey, D., Burget, L., Agarwal, M., Akyazi, P., Feng, K., Ghoshal, A., ... & Rose, R. C. (2010). Subspace Gaussian mixture models for speech recognition. In 2010 IEEE international conference on acoustics, speech and signal processing (pp. 4330\u20134333). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2010.5495662","DOI":"10.1109\/ICASSP.2010.5495662"},{"key":"9797_CR33","unstructured":"Povey, D., Ghoshal, A., Boulianne, G., Burget, L., Glembek, O., Goel, N., ... & Silovsky, J. (2011). The Kaldi speech recognition toolkit. In IEEE 2011 workshop on automatic speech recognition and understanding (No. CONF). IEEE Signal Processing Society."},{"key":"9797_CR34","doi-asserted-by":"publisher","unstructured":"Ravanelli, M., & Janin, A. (2014). TANDEM-bottleneck feature combination using hierarchical Deep Neural Networks. In The 9th international symposium on chinese spoken language processing (pp. 113\u2013117). IEEE. https:\/\/doi.org\/10.1109\/ISCSLP.2014.6936576","DOI":"10.1109\/ISCSLP.2014.6936576"},{"key":"9797_CR35","doi-asserted-by":"crossref","unstructured":"Rosenberg, A., Zhang, Y., Ramabhadran, B., Jia, Y., Moreno, P., Wu, Y., & Wu, Z. (2019). Speech recognition with augmented synthesized speech. In 2019 IEEE automatic speech recognition and understanding workshop (ASRU) (pp. 996\u20131002). IEEE.","DOI":"10.1109\/ASRU46091.2019.9003990"},{"key":"9797_CR36","doi-asserted-by":"publisher","unstructured":"Saon, G., T\u00fcske, Z., Audhkhasi, K., & Kingsbury, B. (2019). Sequence noise injected training for end-to-end speech recognition. In ICASSP 2019\u20132019 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 6261\u20136265). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.2019.8683706","DOI":"10.1109\/ICASSP.2019.8683706"},{"issue":"1","key":"9797_CR37","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1016\/S0095-4470(19)30466-8","volume":"16","author":"S Seneff","year":"1988","unstructured":"Seneff, S. (1988). A joint synchrony\/mean-rate model of auditory speech processing. Journal of Phonetics, 16(1), 55\u201376. https:\/\/doi.org\/10.1016\/S0095-4470(19)30466-8.","journal-title":"Journal of Phonetics"},{"key":"9797_CR38","unstructured":"Serdyuk, D., Audhkhasi, K., Brakel, P., Ramabhadran, B., Thomas, S., & Bengio, Y. (2016). Invariant representations for noisy speech recognition. arXiv preprint. arXiv:1612.01928"},{"key":"9797_CR39","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-019-09775-8","author":"A Singh","year":"2019","unstructured":"Singh, A., Kadyan, V., Kumar, M., & Bassan, N. (2019). ASRoIL: A comprehensive survey for automatic speech recognition of Indian languages. Artificial Intelligence Review. https:\/\/doi.org\/10.1007\/s10462-019-09775-8.","journal-title":"Artificial Intelligence Review"},{"key":"9797_CR40","doi-asserted-by":"publisher","unstructured":"Tebelskis, J., & Waibel, A. (1990). Large vocabulary recognition using linked predictive neural networks. In International conference on acoustics, speech, and signal processing (pp. 437\u2013440). IEEE. https:\/\/doi.org\/10.1109\/ICASSP.1990.115742","DOI":"10.1109\/ICASSP.1990.115742"},{"issue":"1","key":"9797_CR41","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1109\/TASLP.2013.2286906","volume":"22","author":"VS Tomar","year":"2013","unstructured":"Tomar, V. S., & Rose, R. C. (2013). A family of discriminative manifold learning algorithms and their application to speech recognition. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 22(1), 161\u2013171. https:\/\/doi.org\/10.1109\/TASLP.2013.2286906.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"issue":"3","key":"9797_CR42","doi-asserted-by":"publisher","first-page":"247","DOI":"10.1016\/0167-6393(93)90095-3","volume":"12","author":"A Varga","year":"1993","unstructured":"Varga, A., & Steeneken, H. J. (1993). Assessment for automatic speech recognition: II. NOISEX-92: A database and an experiment to study the effect of additive noise on speech recognition systems. Speech Communication, 12(3), 247\u2013251. https:\/\/doi.org\/10.1016\/0167-6393(93)90095-3.","journal-title":"Speech Communication"},{"key":"9797_CR43","doi-asserted-by":"publisher","unstructured":"Zeng, Y. M., Wu, Z. Y., Falk, T., & Chan, W. Y. (2006). Robust GMM based gender classification using pitch and RASTA-PLP parameters of speech. In 2006 International conference on machine learning and cybernetics (pp. 3376\u20133379). IEEE. https:\/\/doi.org\/10.1109\/ICMLC.2006.258497","DOI":"10.1109\/ICMLC.2006.258497"}],"container-title":["International Journal of Speech Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-021-09797-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10772-021-09797-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10772-021-09797-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,14]],"date-time":"2022-12-14T08:13:35Z","timestamp":1671005615000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10772-021-09797-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,2,2]]},"references-count":43,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2021,6]]}},"alternative-id":["9797"],"URL":"https:\/\/doi.org\/10.1007\/s10772-021-09797-0","relation":{},"ISSN":["1381-2416","1572-8110"],"issn-type":[{"value":"1381-2416","type":"print"},{"value":"1572-8110","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,2,2]]},"assertion":[{"value":"5 February 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 January 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 February 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}