{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:02:31Z","timestamp":1757617351792,"version":"3.44.0"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"27","license":[{"start":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T00:00:00Z","timestamp":1734912000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T00:00:00Z","timestamp":1734912000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-20496-1","type":"journal-article","created":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T05:46:18Z","timestamp":1734932778000},"page":"32959-32983","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["APEDM: a new voice casting system using acoustic\u2013phonetic encoder-decoder mapping"],"prefix":"10.1007","volume":"84","author":[{"given":"Sogol","family":"Alipour Esgandani","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6733-3702","authenticated-orcid":false,"given":"Yasser","family":"Shekofteh","sequence":"additional","affiliation":[]},{"given":"Ashkan","family":"Moradi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,23]]},"reference":[{"issue":"9","key":"20496_CR1","doi-asserted-by":"publisher","first-page":"1642","DOI":"10.1109\/TASLP.2016.2580302","volume":"24","author":"N Obin","year":"2016","unstructured":"Obin N, Roebel A (2016) Similarity search of acted voices for automatic voice casting. IEEE\/ACM Trans Audio Speech Lang Process 24(9):1642\u20131651","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"20496_CR2","doi-asserted-by":"publisher","unstructured":"Obin N, Roebel A, Bachman G (2014) On automatic voice casting for expressive speech: Speaker recognition vs. speech classification. In 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP)\u00a0(pp. 950\u2013954). IEEE, Florence, Italy, 04-09 May 2014. https:\/\/doi.org\/10.1109\/ICASSP.2014.6853737","DOI":"10.1109\/ICASSP.2014.6853737"},{"key":"20496_CR3","doi-asserted-by":"publisher","unstructured":"Gresse A, Rouvier M, Dufour R, Labatut V, Bonastre JF (2017) Acoustic pairing of original and dubbed voices in the context of video game localization. In: Interspeech. pp 2839\u20132843.\u00a0https:\/\/doi.org\/10.21437\/Interspeech.2017-1311","DOI":"10.21437\/Interspeech.2017-1311"},{"key":"20496_CR4","doi-asserted-by":"publisher","unstructured":"Gresse A, Quillot M, Dufour R, Labatut V, Bonastre JF (2019) Similarity metric based on siamese neural networks for voice casting. In\u00a0ICASSP 2019-2019 IEEE international conference on acoustics, speech and signal processing (ICASSP)\u00a0(pp. 6585\u20136589). IEEE, Brighton, UK, 12-17 May 2019. https:\/\/doi.org\/10.1109\/ICASSP.2019.8683178","DOI":"10.1109\/ICASSP.2019.8683178"},{"key":"20496_CR5","doi-asserted-by":"publisher","unstructured":"Gresse A, Quillot M, Dufour R, Bonastre JF (2020) Learning voice representation using knowledge distillation for automatic voice casting. In: 21st annual conference of the international speech communication association, Interspeech 2020, Virtual Event. ISCA, Shanghai, China, pp 160\u2013164. https:\/\/doi.org\/10.21437\/Interspeech.2020-2236","DOI":"10.21437\/Interspeech.2020-2236"},{"issue":"3","key":"20496_CR6","doi-asserted-by":"publisher","first-page":"12","DOI":"10.5594\/JMI.2021.3057695","volume":"130","author":"A Malik","year":"2021","unstructured":"Malik A, Nguyen H (2021) Exploring automated voice casting for content localization using deep learning. SMPTE Motion Imaging J 130(3):12\u201318","journal-title":"SMPTE Motion Imaging J"},{"key":"20496_CR7","doi-asserted-by":"crossref","unstructured":"Quillot M et al (2021) Influence of speaker pre-training on character voice representation. In international conference on speech and computer\u00a0(pp. 577\u2013588). Cham: Springer International Publishing","DOI":"10.1007\/978-3-030-87802-3_52"},{"key":"20496_CR8","doi-asserted-by":"crossref","unstructured":"Quillot M, Dufour R, Bonastre JF (2021) Assessing speaker-independent character information for acted voices. In international conference on speech and computer (pp. 565\u2013576). Cham: Springer International Publishing","DOI":"10.1007\/978-3-030-87802-3_51"},{"key":"20496_CR9","unstructured":"Jia Y, Zhang Y, Weiss R, Wang Q, Shen J, Ren F, Nguyen P, Pang R, Lopez Moreno I, Wu Y (2018) Transfer learning from speaker verification to multispeaker text-to-speech synthesis. In: 32nd conference on neural information processing systems (NeurIPS 2018) Advances in neural information processing systems.\u00a0Montr\u00e9al, Canada, p 31"},{"issue":"2","key":"20496_CR10","doi-asserted-by":"publisher","first-page":"5309","DOI":"10.1007\/s11042-023-15555-y","volume":"83","author":"A Verma","year":"2024","unstructured":"Verma A et al (2024) Automatic image caption generation using deep learning. Multimed Tools Appl 83(2):5309\u20135325","journal-title":"Multimed Tools Appl"},{"issue":"4","key":"20496_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3573891","volume":"22","author":"SK Mishra","year":"2023","unstructured":"Mishra SK et al (2023) Dynamic convolution-based encoder-decoder framework for image captioning in Hindi. ACM Trans Asian Low-Resour Lang Inform Process 22(4):1\u201318","journal-title":"ACM Trans Asian Low-Resour Lang Inform Process"},{"key":"20496_CR12","doi-asserted-by":"publisher","unstructured":"Wu F et al (2023) Wav2seq: pre-training speech-to-text encoder-decoder models using pseudo languages. In\u00a0ICASSP 2023-2023 IEEE international conference on acoustics, speech and signal processing (ICASSP)\u00a0(pp. 1\u20135). IEEE, Rhodes Island, Greece, 04-10 June 2023. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10096988","DOI":"10.1109\/ICASSP49357.2023.10096988"},{"key":"20496_CR13","doi-asserted-by":"publisher","unstructured":"Dalmia S, Okhonko D, Lewis M, Edunov S, Watanabe S, Metze F, Zettlemoyer L, Mohamed A (2023) Legonn: building modular encoder-decoder models. IEEE\/ACM transactions on audio, speech, and language processing 31:3112\u20133126. https:\/\/doi.org\/10.1109\/TASLP.2023.3296019","DOI":"10.1109\/TASLP.2023.3296019"},{"issue":"4","key":"20496_CR14","doi-asserted-by":"publisher","first-page":"354","DOI":"10.1007\/s42979-023-01678-4","volume":"4","author":"M Mahanty","year":"2023","unstructured":"Mahanty M, Vamsi B, Madhavi D (2023) A corpus-based auto-encoder-and-decoder machine translation using deep neural network for translation from English to Telugu language. SN Comp Sci 4(4):354","journal-title":"SN Comp Sci"},{"key":"20496_CR15","doi-asserted-by":"publisher","unstructured":"Kano T et al (2023) Speech summarization of long spoken document: Improving memory efficiency of speech\/text encoders. In\u00a0ICASSP 2023-2023 IEEE international conference on acoustics, speech and signal processing (ICASSP)\u00a0(pp. 1\u20135). IEEE, Rhodes Island, Greece, 04-10 June 2023. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10095019","DOI":"10.1109\/ICASSP49357.2023.10095019"},{"issue":"11","key":"20496_CR16","doi-asserted-by":"publisher","first-page":"17075","DOI":"10.1007\/s11042-022-14099-x","volume":"82","author":"GA Babu","year":"2023","unstructured":"Babu GA, Badugu S (2023) Deep learning based sequence to sequence model for abstractive telugu text summarization. Multimedia Tools Appl 82(11):17075\u201317096","journal-title":"Multimedia Tools Appl"},{"key":"20496_CR17","doi-asserted-by":"publisher","unstructured":"Bhosale S, Chakraborty R, Kopparapu SK (2020) Deep encoded linguistic and acoustic cues for attention based end to end speech emotion recognition. In ICASSP 2020-2020 IEEE international conference on acoustics, speech and signal processing (ICASSP) (pp. 7189\u20137193). IEEE, Barcelona, Spain, 04-08 May 2020. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054621","DOI":"10.1109\/ICASSP40776.2020.9054621"},{"key":"20496_CR18","doi-asserted-by":"publisher","unstructured":"Maheshwari U, Goel P, Uthra RA, Patage VV, Tiwari S, Goyal S (2022) Convolutional encoder\u2013decoder architecture for speech enhancement. In: Subramani C, Vijayakumar K, Dakyo B, Dash SS (eds) Proceedings of international conference on power electronics and renewable energy systems. Lecture Notes in Electrical Engineering, vol 795. Springer, Singapore. https:\/\/doi.org\/10.1007\/978-981-16-4943-1_34","DOI":"10.1007\/978-981-16-4943-1_34"},{"key":"20496_CR19","doi-asserted-by":"publisher","unstructured":"Asadi A, Safabakhsh R (2020) The encoder-decoder framework and its applications. In: Pedrycz W, Chen SM (eds) Deep learning: concepts and architectures. Studies in Computational Intelligence, vol 866. Springer, Cham. https:\/\/doi.org\/10.1007\/978-3-030-31756-0_5","DOI":"10.1007\/978-3-030-31756-0_5"},{"key":"20496_CR20","doi-asserted-by":"crossref","unstructured":"Wang B, Ma Lin, Zhang Wei, Li Wei (2018) Reconstruction network for video captioning. In proceedings of the IEEE conference on computer vision and pattern recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR). Salt Lake City, Utah, pp 7622\u20137631","DOI":"10.1109\/CVPR.2018.00795"},{"key":"20496_CR21","unstructured":"Zhang A, Lipton ZC, Li M, Smola AJ (2023) Dive into deep learning. Cambridge University Press"},{"key":"20496_CR22","doi-asserted-by":"publisher","first-page":"108549","DOI":"10.1016\/j.compeleceng.2022.108549","volume":"105","author":"A Moradi","year":"2023","unstructured":"Moradi A, Shekofteh Y (2023) Spoken language identification using a genetic-based fusion approach to combine acoustic and universal phonetic results. Comput Electr Eng 105:108549","journal-title":"Comput Electr Eng"},{"key":"20496_CR23","doi-asserted-by":"publisher","unstructured":"Reza S, Azadi TE, Kabudian J, Shekofteh Y (2014) A robust speaker recognition system combining factor analysis techniques. In: 21th Iranian conference on biomedical engineering (ICBME). Tehran, Iran, pp 343\u2013347.\u00a0https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054362","DOI":"10.1109\/ICASSP40776.2020.9054362"},{"key":"20496_CR24","doi-asserted-by":"publisher","first-page":"107232","DOI":"10.1016\/j.engappai.2023.107232","volume":"127","author":"M Jakubec","year":"2024","unstructured":"Jakubec M et al (2024) Deep speaker embeddings for speaker verification: review and experimental comparison. Eng Appl Artif Intell 127:107232","journal-title":"Eng Appl Artif Intell"},{"issue":"4","key":"20496_CR25","doi-asserted-by":"publisher","first-page":"817","DOI":"10.1007\/s10772-023-10058-5","volume":"26","author":"Y Zhang","year":"2023","unstructured":"Zhang Y, Liu L (2023) Multi-task learning for X-vector based speaker recognition. Int J Speech Technol 26(4):817\u2013823","journal-title":"Int J Speech Technol"},{"key":"20496_CR26","doi-asserted-by":"publisher","first-page":"3054","DOI":"10.1109\/TIFS.2021.3071574","volume":"16","author":"SE Tandogan","year":"2021","unstructured":"Tandogan SE, Sencar HT (2021) Estimating uniqueness of i-vector-based representation of human voice. IEEE Trans Inf Forensics Secur 16:3054\u20133067","journal-title":"IEEE Trans Inf Forensics Secur"},{"key":"20496_CR27","doi-asserted-by":"publisher","unstructured":"Li X et al (2020) Universal phone recognition with a multilingual allophone system. In ICASSP 2020-2020 IEEE international conference on acoustics, speech and signal processing (ICASSP)\u00a0(pp. 8249-8253). IEEE, Barcelona, Spain, 04-08 May 2020. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054362","DOI":"10.1109\/ICASSP40776.2020.9054362"},{"key":"20496_CR28","doi-asserted-by":"publisher","unstructured":"Rostami D, Shekofteh Y (2023) A Persian wake word detection system based on the fine tuning of a universal phone decoder and levenshtein distance. In 2023 9th international conference on web research (ICWR)\u00a0(pp. 35\u201340). IEEE, Tehran, Iran, Islamic Republic of, 03-04 May 2023. https:\/\/doi.org\/10.1109\/ICWR57742.2023.10139277","DOI":"10.1109\/ICWR57742.2023.10139277"},{"issue":"4","key":"20496_CR29","first-page":"1","volume":"1","author":"SO Sadjadi","year":"2013","unstructured":"Sadjadi SO, Slaney M, Heck L (2013) MSR identity toolbox v1.0: a MATLAB toolbox for speaker-recognition research. Speech Lang Process Tech Committee Newslett 1(4):1\u201332","journal-title":"Speech Lang Process Tech Committee Newslett"},{"key":"20496_CR30","unstructured":"Shekofteh Y, Almasganj F, Goodarzi MM (2011) Comparison of linear based feature transformations to improve speech recognition performance. In\u00a02011 19th Iranian conference on electrical engineering\u00a0(pp. 1\u20134). IEEE, Tehran, Iran, 17-19 May 2011"},{"key":"20496_CR31","doi-asserted-by":"publisher","unstructured":"Garofolo JS, Lamel LF, Fisher WM, Pallett DS, Dahlgren NL, Zue Victor, Fiscus JG (1993) Timit acoustic phonetic continuous speech corpus. Linguistic Data Consortium. https:\/\/cir.nii.ac.jp\/crid\/1881146593179904768. https:\/\/doi.org\/10.1109\/ICWR57742.2023.10139277","DOI":"10.1109\/ICWR57742.2023.10139277"},{"key":"20496_CR32","unstructured":"Senoussaoui M, Kenny P, Dehak N, Dumouchel P (2010) An i-vector extractor suitable for speaker recognition with both Microphone and telephone Speech. Proc. The speaker and language recognition workshop.\u00a0Odyssey, p 6"},{"key":"20496_CR33","doi-asserted-by":"publisher","unstructured":"Snyder D et al (2018) X-vectors: robust dnn embeddings for speaker recognition. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)\u00a0(pp. 5329\u20135333). IEEE, Calgary, AB, Canada, 15-20 April 2018. https:\/\/doi.org\/10.1109\/ICASSP.2018.8461375","DOI":"10.1109\/ICASSP.2018.8461375"},{"issue":"1","key":"20496_CR34","doi-asserted-by":"publisher","first-page":"5415","DOI":"10.1038\/s41467-019-13055-y","volume":"10","author":"AC Belkina","year":"2019","unstructured":"Belkina AC et al (2019) Automated optimized parameters for T-distributed stochastic neighbor embedding improve visualization and analysis of large datasets. Nat Commun 10(1):5415","journal-title":"Nat Commun"},{"key":"20496_CR35","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1109\/TASLP.2022.3140549","volume":"30","author":"X Shi","year":"2022","unstructured":"Shi X, Cooper E, Yamagishi J (2022) Use of speaker recognition approaches for learning and evaluating embedding representations of musical instrument sounds. IEEE\/ACM Trans Audio Speech Lang Process 30:367\u2013377","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"20496_CR36","doi-asserted-by":"publisher","first-page":"109635","DOI":"10.1016\/j.apacoust.2023.109635","volume":"213","author":"H Fayyazi","year":"2023","unstructured":"Fayyazi H, Shekofteh Y (2023) Exploiting auditory filter models as interpretable convolutional frontends to obtain optimal architectures for speaker gender recognition. Appl Acoust 213:109635","journal-title":"Appl Acoust"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20496-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-20496-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20496-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T03:45:42Z","timestamp":1757130342000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-20496-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,23]]},"references-count":36,"journal-issue":{"issue":"27","published-online":{"date-parts":[[2025,8]]}},"alternative-id":["20496"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-20496-1","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2024,12,23]]},"assertion":[{"value":"21 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 September 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 December 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 December 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}