{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T06:48:01Z","timestamp":1764226081786,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":25,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032079589","type":"print"},{"value":"9783032079596","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07959-6_5","type":"book-chapter","created":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:00Z","timestamp":1760260920000},"page":"60-69","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Best Data is more Supervised Data \u2013 Even for\u00a0Hungarian ASR"],"prefix":"10.1007","author":[{"given":"Gergely","family":"Dobsinszki","sequence":"first","affiliation":[]},{"given":"P\u00e9ter","family":"Mihajlik","sequence":"additional","affiliation":[]},{"given":"M\u00e1t\u00e9 Soma","family":"K\u00e1d\u00e1r","sequence":"additional","affiliation":[]},{"given":"Tibor","family":"Fegy\u00f3","sequence":"additional","affiliation":[]},{"given":"Katalin","family":"M\u00e1dy","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,13]]},"reference":[{"key":"5_CR1","unstructured":"Ardila, R., et al.: Common voice: a massively-multilingual speech corpus. arXiv preprint arXiv:1912.06670 (2019)"},{"key":"5_CR2","unstructured":"Baevski, A., Zhou, H., Mohamed, A., Auli, M.: Wav2vec 2.0: a framework for self-supervised learning of speech representations (2020). https:\/\/arxiv.org\/abs\/2006.11477"},{"key":"5_CR3","doi-asserted-by":"publisher","unstructured":"Cho, E., Li, J., Kim, S., Jinyu, L.: Cross-language transfer learning and domain adaptation for end-to-end automatic speech recognition. In: Proceedings of the 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7494\u20137498. IEEE (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053538","DOI":"10.1109\/ICASSP40776.2020.9053538"},{"key":"5_CR4","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding (2019). https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"5_CR5","first-page":"50","volume":"105","author":"M G\u00f3sy","year":"2013","unstructured":"G\u00f3sy, M.: Bea-a multifunctional Hungarian spoken language database. Phonetica 105, 50\u201361 (2013)","journal-title":"Phonetica"},{"key":"5_CR6","unstructured":"Governmental Agency for IT Development (KIF\u00dc): Komondor supercomputer (2023). https:\/\/ncc.dkf.hu\/en.html. Hungary\u2019s most powerful supercomputer, located at the University of Debrecen"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd International Conference on Machine Learning, pp. 369\u2013376 (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Gulati, A., et\u00a0al.: Conformer: convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100 (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"5_CR9","unstructured":"Harper, E., et al.: Nemo: a toolkit for conversational ai and large language models (2023). https:\/\/nvidia.github.io\/NeMo\/"},{"key":"5_CR10","doi-asserted-by":"crossref","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: Hubert: self-supervised speech representation learning by masked prediction of hidden units (2021). https:\/\/arxiv.org\/abs\/2106.07447","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"5_CR11","unstructured":"Huang, J., et al.: Cross-language transfer learning, continuous learning, and domain adaptation for end-to-end automatic speech recognition (2020). https:\/\/arxiv.org\/abs\/2005.04290"},{"key":"5_CR12","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Kriman, S., et al.: Quartznet: deep automatic speech recognition with 1d time-channel separable convolutions. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6124\u20136128. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053889"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Kudo, T., Richardson, J.: Sentencepiece: a simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226 (2018)","DOI":"10.18653\/v1\/D18-2012"},{"key":"5_CR15","unstructured":"K\u00e1d\u00e1r, M.S., Dobsinszki, G., M\u00e1dy, K., Mihajlik, P.: Feeding the beast \u2013 the latest developments on the BEA Speech Transcriber and its integration with language model \u2013 in Hungarian. In: XIX. Hungarian Conference on Computational Linguistics, pp. 135\u2013143 (2023)"},{"key":"5_CR16","unstructured":"Lee, Y., Willette, J.R., Kim, J., Hwang, S.J.: Visualizing the loss landscape of self-supervised vision transformer (2024). https:\/\/arxiv.org\/abs\/2405.18042"},{"key":"5_CR17","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization (2019). https:\/\/arxiv.org\/abs\/1711.05101"},{"key":"5_CR18","doi-asserted-by":"publisher","unstructured":"Mihajlik, P., et al.: What kind of multi- or cross-lingual pre-training is the most effective for a spontaneous, less-resourced ASR task? In: Proceedings of the 2nd Annual Meeting of the ELRA\/ISCA SIG on Under-resourced Languages (SIGUL 2023), pp. 58\u201362 (2023). https:\/\/doi.org\/10.21437\/SIGUL.2023-13","DOI":"10.21437\/SIGUL.2023-13"},{"key":"5_CR19","unstructured":"Mihajlik, P., Balog, A., Graczi, T.E., Kohari, A., Tarj\u00e1n, B., Mady, K.: BEA-base: a benchmark for ASR of spontaneous Hungarian. In: Calzolari, N., et al. (eds.) Proceedings of the Thirteenth Language Resources and Evaluation Conference, pp. 1970\u20131977. European Language Resources Association, Marseille (2022). https:\/\/aclanthology.org\/2022.lrec-1.211\/"},{"key":"5_CR20","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"424","DOI":"10.1007\/978-3-319-10816-2_51","volume-title":"Text, Speech and Dialogue","author":"T Neuberger","year":"2014","unstructured":"Neuberger, T., Gyarmathy, D., Gr\u00e1czi, T.E., Horv\u00e1th, V., G\u00f3sy, M., Beke, A.: Development of a large spontaneous speech database of agglutinative hungarian language. In: Sojka, P., Hor\u00e1k, A., Kope\u010dek, I., Pala, K. (eds.) TSD 2014. LNCS (LNAI), vol. 8655, pp. 424\u2013431. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10816-2_51"},{"key":"5_CR21","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision (2022). https:\/\/arxiv.org\/abs\/2212.04356"},{"key":"5_CR22","doi-asserted-by":"crossref","unstructured":"Rekesh, D., et al.: Fast conformer with linearly scalable attention for efficient speech recognition (2023). https:\/\/arxiv.org\/abs\/2305.05084","DOI":"10.1109\/ASRU57964.2023.10389701"},{"key":"5_CR23","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"5_CR24","doi-asserted-by":"publisher","unstructured":"Wang, C., et al.: VoxPopuli: a large-scale multilingual speech corpus for representation learning, semi-supervised learning and interpretation. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, vol. 1: Long Papers, pp. 993\u20131003. Association for Computational Linguistics, Online (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.80. https:\/\/aclanthology.org\/2021.acl-long.80","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"5_CR25","doi-asserted-by":"publisher","unstructured":"Yang, H., Zhao, J., Haffari, G., Shareghi, E.: Self-supervised rewiring of pre-trained speech encoders: towards faster fine-tuning with less labels in speech processing. In: Goldberg, Y., Kozareva, Z., Zhang, Y. (eds.) Findings of the Association for Computational Linguistics: EMNLP 2022, pp. 1952\u20131959. Association for Computational Linguistics, Abu Dhabi (2022). https:\/\/doi.org\/10.18653\/v1\/2022.findings-emnlp.141. https:\/\/aclanthology.org\/2022.findings-emnlp.141","DOI":"10.18653\/v1\/2022.findings-emnlp.141"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07959-6_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:09Z","timestamp":1760260929000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07959-6_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,13]]},"ISBN":["9783032079589","9783032079596"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07959-6_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,13]]},"assertion":[{"value":"13 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interest"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Szeged","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hungary","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom.inf.u-szeged.hu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}