{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T15:42:38Z","timestamp":1780501358118,"version":"3.54.1"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2024,4,15]],"date-time":"2024-04-15T00:00:00Z","timestamp":1713139200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,4,15]],"date-time":"2024-04-15T00:00:00Z","timestamp":1713139200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100002322","name":"Coordena\u00e7\u00e3o de Aperfei\u00e7oamento de Pessoal de N\u00edvel Superior","doi-asserted-by":"crossref","award":["001"],"award-info":[{"award-number":["001"]}],"id":[{"id":"10.13039\/501100002322","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100002322","name":"Coordena\u00e7\u00e3o de Aperfei\u00e7oamento de Pessoal de N\u00edvel Superior","doi-asserted-by":"crossref","award":["001"],"award-info":[{"award-number":["001"]}],"id":[{"id":"10.13039\/501100002322","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2024,7]]},"DOI":"10.1007\/s11227-024-06098-6","type":"journal-article","created":{"date-parts":[[2024,4,15]],"date-time":"2024-04-15T07:01:50Z","timestamp":1713164510000},"page":"16654-16678","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Automatic phoneme recognition by deep neural networks"],"prefix":"10.1007","volume":"80","author":[{"given":"Bianca Val\u00e9ria L.","family":"Pereira","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mateus B. F.","family":"de Carvalho","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pedro Augusto A. da S. de A. Nava","family":"Alves","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Paulo Rogerio de A.","family":"Ribeiro","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Alexandre Cesar M.","family":"de Oliveira","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Areolino","family":"de Almeida Neto","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,4,15]]},"reference":[{"key":"6098_CR1","doi-asserted-by":"publisher","first-page":"54663","DOI":"10.1109\/ACCESS.2020.2980452","volume":"8","author":"M Algabri","year":"2020","unstructured":"Algabri M, Mathkour H, Bencherif MA et al (2020) Towards deep object detection techniques for phoneme recognition. IEEE Access 8:54663\u201354680","journal-title":"IEEE Access"},{"key":"6098_CR2","unstructured":"Bresolin AdA (2008) Speech recognition through units smaller than the word, using wavelet packet and svm, in a new hierarchical decision structure. Ph.D. thesis, Federal University of Rio Grande do Norte"},{"issue":"1","key":"6098_CR3","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1016\/S0346-251X(98)00049-9","volume":"27","author":"D Coniam","year":"1999","unstructured":"Coniam D (1999) Voice recognition software accuracy with second language speakers of English. System 27(1):49\u201364","journal-title":"System"},{"key":"6098_CR4","doi-asserted-by":"crossref","unstructured":"Dai W, Dai C, Qu S et al (2017) Very deep convolutional neural networks for raw waveforms. In: 2017 IEEE International Conference on Acoustics. Speech and Signal Processing ICASSP. IEEE, pp 421\u2013425","DOI":"10.1109\/ICASSP.2017.7952190"},{"key":"6098_CR5","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R et\u00a0al (2009) Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition. IEEE, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"6098_CR6","doi-asserted-by":"crossref","unstructured":"Erhan D, Szegedy C, Toshev A et\u00a0al (2014) Scalable object detection using deep neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 2147\u20132154","DOI":"10.1109\/CVPR.2014.276"},{"issue":"2","key":"6098_CR7","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham M, Van Gool L, Williams CK et al (2010) The Pascal visual object classes VOC challenge. Int J Comput Vis 88(2):303\u2013338","journal-title":"Int J Comput Vis"},{"key":"6098_CR8","doi-asserted-by":"crossref","unstructured":"Fan R, Liu G (2018) CNN-based audio front end processing on speech recognition. In: 2018 International Conference on Audio, Language and Image Processing ICALIP. IEEE, pp 349\u2013354","DOI":"10.1109\/ICALIP.2018.8455731"},{"issue":"2","key":"6098_CR9","doi-asserted-by":"publisher","first-page":"254","DOI":"10.1109\/TASSP.1981.1163530","volume":"29","author":"S Furui","year":"1981","unstructured":"Furui S (1981) Cepstral analysis technique for automatic speaker verification. IEEE Trans Acoust Speech Signal Process 29(2):254\u2013272","journal-title":"IEEE Trans Acoust Speech Signal Process"},{"key":"6098_CR10","doi-asserted-by":"crossref","unstructured":"Glackin C, Wall JA, Chollet G et\u00a0al (2018) Convolutional neural networks for phoneme recognition. In: ICPRAM, pp 190\u2013195","DOI":"10.5220\/0006653001900195"},{"key":"6098_CR11","unstructured":"Gordillo CDA (2013) Recognition of continuous voice combining MFCC and PNCC attributes with SS, WD, map and FRN robustness methods. Ph.D. thesis, Pontifical Catholic University of Rio de Janeiro"},{"key":"6098_CR12","doi-asserted-by":"crossref","unstructured":"Graves A, Fern\u00e1ndez S, Gomez F et\u00a0al (2006) Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd International Conference on Machine Learning, pp 369\u2013376","DOI":"10.1145\/1143844.1143891"},{"key":"6098_CR13","doi-asserted-by":"crossref","unstructured":"Graves A, Mohamed Ar, Hinton G (2013) Speech recognition with deep recurrent neural networks. In: 2013 IEEE International Conference on Acoustics, Speech and Signal Processing. IEEE, pp 6645\u20136649","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"6098_CR14","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1016\/j.engappai.2016.12.012","volume":"59","author":"DT Grozdic","year":"2017","unstructured":"Grozdic DT, Jovicic S, Suboti\u0107 M (2017) Whispered speech recognition using deep denoising autoencoder. Eng Appl Artif Intell 59:15\u201322","journal-title":"Eng Appl Artif Intell"},{"key":"6098_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s11227-021-04124-5","volume":"78","author":"V Gupta","year":"2022","unstructured":"Gupta V, Juyal S, Hu YC (2022) Understanding human emotions through speech spectrograms using deep neural network. J Supercomput 78:1\u201330","journal-title":"J Supercomput"},{"key":"6098_CR16","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S et\u00a0al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"6","key":"6098_CR17","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton G, Deng L, Yu D et al (2012) Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups. IEEE Signal Process Mag 29(6):82\u201397","journal-title":"IEEE Signal Process Mag"},{"key":"6098_CR18","doi-asserted-by":"crossref","unstructured":"Howard A, Sandler M, Chu G et\u00a0al (2019) Searching for mobilenetv3. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 1314\u20131324","DOI":"10.1109\/ICCV.2019.00140"},{"key":"6098_CR19","unstructured":"Howard AG, Zhu M, Chen B et\u00a0al (2017) Mobilenets: efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861"},{"key":"6098_CR20","unstructured":"Iandola FN, Han S, Moskewicz MW et\u00a0al (2016) Squeezenet: alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size. arXiv preprint arXiv:1602.07360"},{"issue":"2","key":"6098_CR21","doi-asserted-by":"publisher","first-page":"307","DOI":"10.1109\/TIT.1986.1057145","volume":"32","author":"BH Juang","year":"1986","unstructured":"Juang BH, Levinson S, Sondhi M (1986) Maximum likelihood estimation for multivariate mixture observations of Markov chains Corresp. IEEE Trans Inf Theory 32(2):307\u2013309","journal-title":"IEEE Trans Inf Theory"},{"key":"6098_CR22","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. Adv Neural Inf Process Syst 25:1097\u20131105","journal-title":"Adv Neural Inf Process Syst"},{"key":"6098_CR23","unstructured":"Kuznetsova A, Rom H, Alldrin N et\u00a0al (2018) The open images dataset v4: unified image classification, object detection, and visual relationship detection at scale. arXiv e-prints pp arXiv\u20131811"},{"key":"6098_CR24","unstructured":"Lathi BP (2006) Linear signals and systems-2. Bookman"},{"key":"6098_CR25","unstructured":"Lin M, Chen Q, Yan S (2013) Network in network. arXiv preprint arXiv:1312.4400"},{"key":"6098_CR26","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S et\u00a0al (2014) Microsoft coco: common objects in context. In: European Conference on Computer Vision. Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"6098_CR27","doi-asserted-by":"crossref","unstructured":"Liu W, Anguelov D, Erhan D et\u00a0al (2016) Ssd: single shot multibox detector. In: European Conference on Computer Vision. pringer, pp 21\u201337","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"6098_CR28","doi-asserted-by":"crossref","unstructured":"Lugosch L, Ravanelli M, Ignoto P et\u00a0al (2019) Speech model pre-training for end-to-end spoken language understanding. arXiv preprint arXiv:1904.03670","DOI":"10.21437\/Interspeech.2019-2396"},{"key":"6098_CR29","doi-asserted-by":"crossref","unstructured":"McAuliffe M, Socolof M, Mihuc S et\u00a0al (2017) Montreal forced aligner: trainable text-speech alignment using kaldi. In: Interspeech, pp 498\u2013502","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"6098_CR30","doi-asserted-by":"crossref","unstructured":"Meftah A, Alotaibi YA, Selouani SA (2016) A comparative study of different speech features for Arabic phonemes classification. In: 2016 European Modelling Symposium EMS. EEE, pp 47\u201352","DOI":"10.1109\/EMS.2016.018"},{"key":"6098_CR31","doi-asserted-by":"crossref","unstructured":"Muckenhirn H, Doss MM, Marcell S (2018) Towards directly modeling raw speech signal for speaker verification using CNNs. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP. IEEE, pp 4884\u20134888","DOI":"10.1109\/ICASSP.2018.8462165"},{"key":"6098_CR32","doi-asserted-by":"crossref","unstructured":"Palaz D, Doss MM, Collobert R (2015) Convolutional neural networks-based continuous speech recognition using raw speech signal. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP. IEEE, pp 4295\u20134299","DOI":"10.1109\/ICASSP.2015.7178781"},{"key":"6098_CR33","doi-asserted-by":"crossref","unstructured":"Panayotov V, Chen G, Povey D et\u00a0al (2015) Librispeech: an ASR corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP. IEEE, pp 5206\u20135210","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"6098_CR34","doi-asserted-by":"crossref","unstructured":"Quintanilha IM, Biscainho LWP, Netto SL (2017) Towards an end-to-end speech recognizer for portuguese using deep neural networks. In: Proceedings of 35th Simp\u00f3sio Brasileiro de Telecomunica\u00e7\u00e3ues e Processamento de Sinais","DOI":"10.14209\/sbrt.2017.73"},{"issue":"2","key":"6098_CR35","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1109\/5.18626","volume":"77","author":"L Rabiner","year":"1989","unstructured":"Rabiner L (1989) A tutorial on hidden Markov models and selected applications in speech recognition. Proc IEEE 77(2):257\u2013286. https:\/\/doi.org\/10.1109\/5.18626","journal-title":"Proc IEEE"},{"key":"6098_CR36","doi-asserted-by":"crossref","unstructured":"Redmon J, Divvala S, Girshick R et\u00a0al (2016) You only look once: unified, real-time object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition CVPR","DOI":"10.1109\/CVPR.2016.91"},{"key":"6098_CR37","first-page":"91","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren S, He K, Girshick R et al (2015) Faster R-CNN: towards real-time object detection with region proposal networks. Adv Neural Inf Process Syst 28:91\u201399","journal-title":"Adv Neural Inf Process Syst"},{"issue":"3","key":"6098_CR38","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky O, Deng J, Su H et al (2015) Imagenet large scale visual recognition challenge. Int J Comput Vis 115(3):211\u2013252","journal-title":"Int J Comput Vis"},{"key":"6098_CR39","doi-asserted-by":"crossref","unstructured":"Sak H, Senior AW, Beaufays F (2014) Long short-term memory recurrent neural network architectures for large scale acoustic modeling. Google","DOI":"10.21437\/Interspeech.2014-80"},{"key":"6098_CR40","doi-asserted-by":"crossref","unstructured":"Sak H, Vinyals O, Heigold G et\u00a0al (2014) Sequence discriminative distributed training of long short-term memory recurrent neural networks. Google","DOI":"10.21437\/Interspeech.2014-305"},{"key":"6098_CR41","doi-asserted-by":"crossref","unstructured":"Sandler M, Howard A, Zhu M et\u00a0al (2018) Mobilenetv2: inverted residuals and linear bottlenecks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 4510\u20134520","DOI":"10.1109\/CVPR.2018.00474"},{"issue":"4","key":"6098_CR42","first-page":"506","volume":"16","author":"BL Sree","year":"2020","unstructured":"Sree BL, Vijaya M (2020) Building acoustic model for phoneme recognition using PSO-DBN. Int J Bus Intell Data Min 16(4):506\u2013523","journal-title":"Int J Bus Intell Data Min"},{"key":"6098_CR43","doi-asserted-by":"publisher","unstructured":"Wang L, Feng S, Hasegawa-Johnson M et\u00a0al (2022) Self-supervised semantic-driven phoneme discovery for zero-resource speech recognition. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) pp 8027\u20138047. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.553, https:\/\/aclanthology.org\/2022.acl-long.553","DOI":"10.18653\/v1\/2022.acl-long.553"},{"key":"6098_CR44","doi-asserted-by":"crossref","unstructured":"Zhang X, Zhou X, Lin M et\u00a0al (2018) Shufflenet: an extremely efficient convolutional neural network for mobile devices. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Elsevier, pp 6848\u20136856","DOI":"10.1109\/CVPR.2018.00716"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-024-06098-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-024-06098-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-024-06098-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,25]],"date-time":"2024-06-25T11:11:04Z","timestamp":1719313864000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-024-06098-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,15]]},"references-count":44,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2024,7]]}},"alternative-id":["6098"],"URL":"https:\/\/doi.org\/10.1007\/s11227-024-06098-6","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,4,15]]},"assertion":[{"value":"23 March 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 April 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}