{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,21]],"date-time":"2025-01-21T05:22:08Z","timestamp":1737436928773,"version":"3.33.0"},"reference-count":44,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"JSPS KAKENHI","award":["JP21H05054","JP23K21681"],"award-info":[{"award-number":["JP21H05054","JP23K21681"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/access.2025.3527012","type":"journal-article","created":{"date-parts":[[2025,1,8]],"date-time":"2025-01-08T20:25:08Z","timestamp":1736367908000},"page":"8638-8648","source":"Crossref","is-referenced-by-count":0,"title":["ZeST: A Zero-Resourced Speech-to-Speech Translation Approach for Unknown, Unpaired, and Untranscribed Languages"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4882-8336","authenticated-orcid":false,"given":"Luan","family":"Thanh Nguyen","sequence":"first","affiliation":[{"name":"Japan Advanced Institute of Science and Technology, Nomi, Ishikawa, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5509-8963","authenticated-orcid":false,"given":"Sakriani","family":"Sakti","sequence":"additional","affiliation":[{"name":"Japan Advanced Institute of Science and Technology, Nomi, Ishikawa, Japan"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1","article-title":"Sequence to sequence learning with neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sutskever"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref4","article-title":"Neural machine translation by jointly learning to align and translate","volume-title":"Optical Spectroscopic Techniques and Instrumentation for Atmospheric and Space Research","author":"Bahdanau","year":"2016"},{"key":"ref5","article-title":"Google\u2019s neural machine translation system: Bridging the gap between human and machine translation","author":"Wu","year":"2016","journal-title":"arXiv:1609.08144"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1388"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1353\/lan.0.0054"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4615-2732-9"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.661"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383496"},{"volume-title":"Text-to-Speech Synthesis Using Found Data for Low-resource Languages","year":"2019","author":"Cooper","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10938"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/SIGUL.2023-12"},{"key":"ref14","first-page":"1858","article-title":"Unsupervised learning of spoken language with visual context","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"29","author":"Harwath"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-502"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.12967"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10652"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462396"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095091"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-903"},{"key":"ref22","first-page":"1336","article-title":"On generative spoken language modeling from raw audio","volume":"9","author":"Lakhotia","year":"2021","journal-title":"Trans. Assoc. Comput. Linguistics"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3444470"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.951"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.63"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11032"},{"key":"ref28","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.411"},{"key":"ref31","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics, Hum. Lang. Technol.","volume":"1","author":"Devlin"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.4324\/9781003022022-6"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref34","article-title":"Multilingual translation with extensible multilingual pretraining and finetuning","author":"Tang","year":"2020","journal-title":"arXiv:2008.00401"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683069"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p17-1057"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"volume-title":"Ultralytics YOLOv8","year":"2023","author":"Jocher","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref42","first-page":"12449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Baevski"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10820123\/10833610.pdf?arnumber=10833610","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T19:06:03Z","timestamp":1737399963000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10833610\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/access.2025.3527012","relation":{},"ISSN":["2169-3536"],"issn-type":[{"type":"electronic","value":"2169-3536"}],"subject":[],"published":{"date-parts":[[2025]]}}}