{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,15]],"date-time":"2025-06-15T04:05:13Z","timestamp":1749960313629,"version":"3.41.0"},"publisher-location":"Singapore","reference-count":44,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819682973","type":"print"},{"value":"9789819682980","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-8298-0_12","type":"book-chapter","created":{"date-parts":[[2025,6,14]],"date-time":"2025-06-14T18:21:52Z","timestamp":1749925312000},"page":"147-158","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Late Fusion Ensembles for\u00a0Speech Recognition on\u00a0Diverse Input Audio Representations"],"prefix":"10.1007","author":[{"given":"Marin","family":"Jezid\u017ei\u0107","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1023-8413","authenticated-orcid":false,"given":"Matej","family":"Mihel\u010di\u0107","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,15]]},"reference":[{"key":"12_CR1","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems (NeurIPS20). pp. 12449\u201312460 (2020)"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Binbin, Z., et\u00a0al.: Wenetspeech: a 10000+ hours multi-domain mandarin corpus for speech recognition. In: Proceedings of the 47th International Conference on Acoustics, Speech and Signal Processing (ICASSP22), pp. 6182\u20136186. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746682"},{"key":"12_CR3","doi-asserted-by":"publisher","first-page":"425","DOI":"10.1121\/1.400476","volume":"89","author":"J Brown","year":"1991","unstructured":"Brown, J.: Calculation of a constant q spectral transform. J. Acoust. Soc. Am. 89, 425\u2013434 (1991)","journal-title":"J. Acoust. Soc. Am."},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Bu, H., Du, J., Na, X., Wu, B., Zheng, H.: Aishell-1: an open-source mandarin speech corpus and a speech recognition baseline. In: Proceedings of the 20th Conference of the Oriental Chapter of the International Coordinating Committee on Speech Databases and Speech I\/O Systems and Assessment (O-COCOSDA17), pp.\u00a01\u20135. IEEE (2017)","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Cao, B., et\u00a0al.: Data augmentation for end-to-end silent speech recognition for laryngectomees. In: Proceedings of the 23rd Conference of the International Speech Communication Association (Interspeech22), pp. 3653\u20133657 (2022)","DOI":"10.21437\/Interspeech.2022-10868"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Chen, G., et\u00a0al.: Gigaspeech: an evolving, multi-domain ASR corpus with 10,000 hours of transcribed audio. In: Proceedings of the 22nd Conference of the International Speech Communication Association (Interspeech21), pp. 3670\u20133674 (2021)","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Chung, Y.A., et\u00a0al.: w2v-Bert: combining contrastive learning and masked language modeling for self-supervised speech pre-training. In: Proceedings of the IEEE Automatic Speech Recognition and Understanding Workshop (ASRU21). pp. 244\u2013250 (2021)","DOI":"10.1109\/ASRU51503.2021.9688253"},{"issue":"3","key":"12_CR8","doi-asserted-by":"publisher","first-page":"733","DOI":"10.1162\/coli_a_00445","volume":"48","author":"P Dufter","year":"2022","unstructured":"Dufter, P., Schmitt, M., Sch\u00fctze, H.: Position information in transformers: an overview. Comput. Linguist. 48(3), 733\u2013763 (2022)","journal-title":"Comput. Linguist."},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Farrell, K.R., Mammone, R.J.: Data fusion Techniques for Speaker Recognition, pp. 279\u2013297. Springer, Boston (1995)","DOI":"10.1007\/978-1-4615-2281-2_12"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Ge, T., Si-Qing, C., Wei, F.: EdgeFormer: a parameter-efficient transformer for on-device seq2seq generation. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 10786\u201310798. Association for Computational Linguistics (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.741"},{"key":"12_CR11","unstructured":"Gulati, A., et\u00a0al.: Conformer: Convolution-augmented transformer for speech recognition. In: Proceedings of the 21st Conference of the International Speech Communication Association (Interspeech20), pp. 5036\u20135040. ISCA (2020)"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Han, W., et\u00a0al.: ContextNet: improving convolutional neural networks for automatic speech recognition with global context. In: Proceedings of the 21st Conference of the International Speech Communication Association (Interspeech20), pp. 3610\u20133614 (2020)","DOI":"10.21437\/Interspeech.2020-2059"},{"issue":"1","key":"12_CR13","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1109\/TASLP.2016.2632307","volume":"25","author":"Y Han","year":"2016","unstructured":"Han, Y., Kim, J., Lee, K.: Deep convolutional neural networks for predominant instrument recognition in polyphonic music. IEEE\/ACM Trans. Audio Speech Lang. Process. 25(1), 208\u2013221 (2016)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"12_CR14","doi-asserted-by":"publisher","first-page":"3320","DOI":"10.1109\/TASLP.2023.3306709","volume":"31","author":"Z Han","year":"2023","unstructured":"Han, Z., Gao, D., Cheng, G., Povey, D., Pengyuan, Z., Yan, Y.: Alternative pseudo-labeling for semi-supervised automatic speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 31, 3320\u20133330 (2023)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"12_CR15","unstructured":"Hu, E.J., et\u00a0al.: LoRA: low-rank adaptation of large language models. In: International Conference on Learning Representations (2022)"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Huang, M., Leung, H., Hou, M.: 3D object detection using multiple-frame proposal features fusion. Sensors 23(22) (2023)","DOI":"10.3390\/s23229162"},{"key":"12_CR17","doi-asserted-by":"publisher","DOI":"10.1016\/j.chaos.2022.112512","volume":"162","author":"S Jothimani","year":"2022","unstructured":"Jothimani, S., Premalatha, K.: MFF-SAUG: multi feature fusion with spectrogram augmentation of speech emotion recognition using convolution neural network. Chaos, Solitons Fractals 162, 112512 (2022)","journal-title":"Chaos, Solitons Fractals"},{"key":"12_CR18","unstructured":"Kim, K., et\u00a0al.: E-branchformer: Branchformer with enhanced merging for speech recognition. In: IEEE Spoken Language Technology Workshop SLT22, pp. 84\u201391. IEEE (2022)"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"Kuncheva, I.L.: Diversity in Classifier Ensembles, chap.\u00a08, pp. 247\u2013289. Wiley (2014)","DOI":"10.1002\/9781118914564.ch8"},{"key":"12_CR20","unstructured":"Linhao, D., Xu, S., Bo, X.: Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP18), pp. 5884\u20135888 (2018)"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Lohrenz, T., Fingscheidt, T.: BLSTM-driven stream fusion for automatic speech recognition: novel methods and a multi-size window fusion example. In: Proceedings of the 21th Conference of the International Speech Communication Association (Interspeech20). IEEE (2020)","DOI":"10.21437\/Interspeech.2020-2560"},{"key":"12_CR22","doi-asserted-by":"crossref","unstructured":"Lohrenz, T., Li, Z., Fingscheidt, T.: Multi-encoder learning and stream fusion for transformer-based end-to-end automatic speech recognition. In: Proceedings of the 22nd Conference of the International Speech Communication Association (Interspeech21), pp. 2846\u20132850. ISCA (2021)","DOI":"10.21437\/Interspeech.2021-555"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Loweimi, E., Ahadi, S.M., Drugman, T.: A new phase-based feature representation for robust speech recognition. In: Proceedings of the 2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 7155\u20137159 (2013)","DOI":"10.1109\/ICASSP.2013.6639051"},{"issue":"1","key":"12_CR24","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1006\/csla.2001.0184","volume":"16","author":"M Mohri","year":"2002","unstructured":"Mohri, M., Pereira, F., Riley, M.: Weighted finite-state transducers in speech recognition. Comput. Speech Lang. 16(1), 69\u201388 (2002)","journal-title":"Comput. Speech Lang."},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Mu, X., Min, C.H.: MFCC as features for speaker classification using machine learning. In: 2023 IEEE World AI IoT Congress (AIIoT), pp. 0566\u20130570 (2023)","DOI":"10.1109\/AIIoT58121.2023.10174566"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an ASR corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP15), pp. 5206\u20135210 (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"12_CR27","unstructured":"Peng, Y., Dalmia, S., Lane, I.R., Watanabe, S.: Branchformer: parallel MLP-attention architectures to capture local and global context for speech recognition and understanding. In: International Conference on Machine Learning (ICML22), vol.\u00a0162, pp. 17627\u201317643. PMLR (2022)"},{"key":"12_CR28","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: Proceedings of the International Conference on Machine Learning (ICML23), Proceedings of Machine Learning Research, vol.\u00a0202, pp. 28492\u201328518. PMLR (2023)"},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Rajesh, M.H., M., H.A., Venkata, G., Rao, R.: Significance of the modified group delay feature in speech recognition. Trans. Audio, Speech and Lang. Proc. 15(1), 190-202 (2007)","DOI":"10.1109\/TASL.2006.876858"},{"key":"12_CR30","unstructured":"Rousseau, A., Del\u00e9glise, P., Est\u00e8ve, Y.: Enhancing the TED-LIUM corpus with selected data for language modeling and more TED talks. In: Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC\u201914), pp. 3935\u20133939. European Language Resources Association (ELRA) (2014)"},{"key":"12_CR31","unstructured":"Sehoon, K., et\u00a0al.: Squeezeformer: an efficient transformer for automatic speech recognition. In: Advances in Neural Information Processing Systems (NeurIPS22), vol.\u00a035, pp. 9361\u20139373. Curran Associates, Inc. (2022)"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Shaw, P., Uszkoreit, J., Vaswani, A.: Self-attention with relative position representations. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 464\u2013468. Association for Computational Linguistics (2018)","DOI":"10.18653\/v1\/N18-2074"},{"key":"12_CR33","unstructured":"Shi, H., Wang, L., Sheng, L.C., F., Dang, J., Kawahara, T.: Spectrograms fusion-based end-to-end robust automatic speech recognition. In: Proceedings of the 2021 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC), pp. 438\u2013442 (2021)"},{"key":"12_CR34","doi-asserted-by":"crossref","unstructured":"Subramanian, S., Ganapathiraman, V.: Zeroth order greedylr: an adaptive learning rate scheduler for deep neural network training. In: PRML 2023 (2023)","DOI":"10.1109\/PRML59573.2023.10348370"},{"key":"12_CR35","doi-asserted-by":"crossref","unstructured":"Sun, S., et\u00a0al.: Unified multi-modal unsupervised representation learning for skeleton-based action understanding. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 2973\u20132984. ACM (2023)","DOI":"10.1145\/3581783.3612449"},{"key":"12_CR36","unstructured":"Tufekci, Z., Gowdy, J.: Feature extraction using discrete wavelet transform for speech recognition. In: Proceedings of the IEEE SoutheastCon 2000. \u2018Preparing for The New Millennium\u2019, pp. 116\u2013123 (2000)"},{"key":"12_CR37","doi-asserted-by":"crossref","unstructured":"Varanis, M., Pederiva, R.: Wavelet packet energy-entropy feature extraction and principal component analysis for signal classification. In: Proceeding Series of the Brazilian Society of Computational and Applied Mathematics, vol.\u00a03 (2015)","DOI":"10.5540\/03.2015.003.01.0471"},{"key":"12_CR38","unstructured":"Vaswani, A., et\u00a0al.: Attention is all you need. In: Advances in Neural Information Processing Systems (NeurIPS17), vol.\u00a030. Curran Associates, Inc. (2017)"},{"key":"12_CR39","unstructured":"Watanabe, S., et\u00a0al.: ESPnet: end-to-end speech processing toolkit. In: Proceedings of the 19th Conference of the International Speech Communication Association (Interspeech18). pp. 2207\u20132211 (2018)"},{"issue":"8","key":"12_CR40","doi-asserted-by":"publisher","first-page":"1240","DOI":"10.1109\/JSTSP.2017.2763455","volume":"11","author":"S Watanabe","year":"2017","unstructured":"Watanabe, S., Hori, T., Kim, S., Hershey, J.R., Hayashi, T.: Hybrid CTC\/attention architecture for end-to-end speech recognition. IEEE J. Sel. Top. Signal Process. 11(8), 1240\u20131253 (2017)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"12_CR41","unstructured":"Yao, Z., et\u00a0al.: Zipformer: a faster and better encoder for automatic speech recognition. In: The 12th International Conference on Learning Representations (ICLR24) (2024)"},{"key":"12_CR42","doi-asserted-by":"crossref","unstructured":"Yao, Z., et\u00a0al.: Wenet: production oriented streaming and non-streaming end-to-end speech recognition toolkit. In: Proceedings of the 22nd Conference of the International Speech Communication Association (Interspeech21), pp. 4054\u20134058 (2021)","DOI":"10.21437\/Interspeech.2021-1983"},{"issue":"5","key":"12_CR43","doi-asserted-by":"publisher","first-page":"707","DOI":"10.1016\/j.specom.2010.04.008","volume":"53","author":"H Yin","year":"2011","unstructured":"Yin, H., Hohmann, V., Nadeu, C.: Acoustic features for speech recognition based on gammatone filterbank and instantaneous frequency. Speech Commun. 53(5), 707\u2013715 (2011)","journal-title":"Speech Commun."},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xue, J., Chen, X.: Ensemble learning approaches in speech recognition. In: Speech and Audio Processing for Coding, Enhancement and Recognition, pp. 113\u2013152. Springer (2014)","DOI":"10.1007\/978-1-4939-1456-2_5"}],"container-title":["Lecture Notes in Computer Science","Data Science: Foundations and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-8298-0_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,14]],"date-time":"2025-06-14T18:22:07Z","timestamp":1749925327000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-8298-0_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819682973","9789819682980"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-8298-0_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"15 June 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PAKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific-Asia Conference on Knowledge Discovery and Data Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sydney, NSW","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Australia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 June 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 June 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pakdd2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/pakdd2025.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}