{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T05:41:02Z","timestamp":1764222062966,"version":"3.46.0"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/sped67700.2025.11252367","type":"proceedings-article","created":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T19:00:27Z","timestamp":1764183627000},"page":"102-107","source":"Crossref","is-referenced-by-count":0,"title":["Open Source State-Of-the-Art Solution for Romanian Speech Recognition"],"prefix":"10.1109","author":[{"given":"Gabriel","family":"P\u00eerlogeanu","sequence":"first","affiliation":[{"name":"National University of Science and Technology Politehnica Bucharest,SpeeD,Bucharest,Romania"}]},{"given":"Alexandru-Lucian","family":"Georgescu","sequence":"additional","affiliation":[{"name":"National University of Science and Technology Politehnica Bucharest,SpeeD,Bucharest,Romania"}]},{"given":"Horia","family":"Cucu","sequence":"additional","affiliation":[{"name":"National University of Science and Technology Politehnica Bucharest,SpeeD,Bucharest,Romania"}]}],"member":"263","reference":[{"volume-title":"End-to-end speech recognition: A survey","year":"2023","author":"Prabhavalkar","key":"ref1"},{"volume-title":"A survey on speech large language models","year":"2025","author":"Peng","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053040"},{"volume-title":"Efficient sequence transduction by jointly predicting tokens and durations","year":"2023","author":"Xu","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446861"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/SpeD53181.2021.9587383"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/SpeD53181.2021.9587345"},{"key":"ref8","doi-asserted-by":"crossref","DOI":"10.1109\/TSP59544.2023.10197791","volume-title":"Towards improving the performance of pre-trained speech models for low-resource languages through lateral inhibition","author":"Avram","year":"2023"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/SpeD59241.2023.10314923"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"volume-title":"Fast conformer with linearly scalable attention for efficient speech recognition","author":"Rekesh","key":"ref11","doi-asserted-by":"crossref","DOI":"10.1109\/ASRU57964.2023.10389701"},{"article-title":"RSC: A Romanian read speech corpus for automatic speech recognition","volume-title":"Proc. LREC","author":"Georgescu","key":"ref12"},{"article-title":"CoBiLiRo: A research platform for bimodal corpora","volume-title":"Proceedings of the 1st International Workshop on Language Technology Platforms","author":"Cristea","key":"ref13"},{"article-title":"CoRoLa \u2014 the reference corpus of contemporary Romanian language","volume-title":"Proc. LREC","author":"Mititelu","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.3390\/app14199043"},{"volume-title":"End-to-end asr: from supervised to semi-supervised learning with modern architectures","year":"2020","author":"Synnaeve","key":"ref16"},{"volume-title":"Robust speech recognition via large-scale weak supervision","year":"2022","author":"Radford","key":"ref17"},{"volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","year":"2020","author":"Baevski","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"volume-title":"Longformer: The longdocument transformer","year":"2020","author":"Beltagy","key":"ref20"},{"volume-title":"Sequence transduction with recurrent neural networks","year":"2012","author":"Graves","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"article-title":"Common voice: A massively-multilingual speech corpus","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference","author":"Ardila","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023141"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-1006"},{"volume-title":"Nemo: a toolkit for building ai applications using neural modules","year":"2019","author":"Kuchaiev","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref28","first-page":"66","article-title":"SentencePiece: A simple and language independent subword tokenizer and detokenizer for neural text processing","volume-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","author":"Kudo"},{"volume-title":"Musan: A music, speech, and noise corpus","year":"2015","author":"Snyder","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"volume-title":"Decoupled weight decay regularization","year":"2019","author":"Loshchilov","key":"ref31"},{"article-title":"KenLM: Faster and smaller language model queries","volume-title":"Proceedings of the Sixth Workshop on Statistical Machine Translation","author":"Heafield","key":"ref32"},{"article-title":"The kaldi speech recognition toolkit","volume-title":"Proc. ASRU","author":"Ghoshal","key":"ref33"}],"event":{"name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","start":{"date-parts":[[2025,10,19]]},"location":"Cluj-Napoca, Romania","end":{"date-parts":[[2025,10,22]]}},"container-title":["2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11251505\/11251597\/11252367.pdf?arnumber=11252367","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T05:37:39Z","timestamp":1764221859000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11252367\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/sped67700.2025.11252367","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}