{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T06:06:36Z","timestamp":1757311596724,"version":"3.37.3"},"reference-count":25,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100002642","name":"Korea University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002642","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,4,14]]},"DOI":"10.1109\/icassp48485.2024.10447331","type":"proceedings-article","created":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T18:56:31Z","timestamp":1710788191000},"page":"12722-12726","source":"Crossref","is-referenced-by-count":3,"title":["TranSentence: speech-to-speech Translation via Language-Agnostic Sentence-Level Speech Encoding without Language-Parallel Data"],"prefix":"10.1109","author":[{"given":"Seung-Bin","family":"Kim","sequence":"first","affiliation":[{"name":"Korea University,Department of Artificial Intelligence,Seoul,Korea"}]},{"given":"Sang-Hoon","family":"Lee","sequence":"additional","affiliation":[{"name":"Korea University,Department of Artificial Intelligence,Seoul,Korea"}]},{"given":"Seong-Whan","family":"Lee","sequence":"additional","affiliation":[{"name":"Korea University,Department of Artificial Intelligence,Seoul,Korea"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.1109\/ICASSP.1997.599557"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1109\/TSA.2005.860774"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/ICASSP49357.2023.10096183"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.21437\/Interspeech.2019-1951"},{"volume-title":"International Conference on Machine Learning","author":"Jia","article-title":"Translatotron 2: High-quality direct speech-to-speech translation with voice preservation","key":"ref5"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/TASLP.2021.3122291"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.18653\/v1\/2022.acl-long.235"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.21437\/Interspeech.2021-475"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.18653\/v1\/2022.naacl-main.63"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1109\/ICASSP49357.2023.10095578"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1109\/ICASSP49357.2023.10096797"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.18653\/v1\/2023.acl-long.872"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1109\/ICASSP49357.2023.10095616"},{"year":"2023","author":"Diwan","article-title":"Unit-based speech-to-speech translation without parallel data","key":"ref14"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1109\/icassp48485.2024.10448426"},{"key":"ref16","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","author":"Kong","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.18653\/v1\/2023.acl-long.899"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.21437\/Interspeech.2022-143"},{"key":"ref19","article-title":"Hierspeech: Bridging the gap between text and speech by hierarchical variational inference using self-supervised representations for speech synthesis","author":"Lee","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref20","first-page":"4218","article-title":"Common voice: A massively-multilingual speech corpus","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference","author":"Ardila"},{"volume-title":"Proceedings of the Thirteenth Language Resources and Evaluation Conference","author":"Jia","article-title":"CVSS corpus and massively multilingual speech-to-speech translation","key":"ref21"},{"key":"ref22","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.21437\/Interspeech.2021-329"},{"volume-title":"The Eleventh International Conference on Learning Representations","author":"Huang","article-title":"Transpeech: Speech-to-speech translation with bilateral perturbation","key":"ref24"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.1609\/aaai.v34i03.5632"}],"event":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2024,4,14]]},"location":"Seoul, Korea, Republic of","end":{"date-parts":[[2024,4,19]]}},"container-title":["ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10445798\/10445803\/10447331.pdf?arnumber=10447331","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,2]],"date-time":"2024-08-02T05:35:29Z","timestamp":1722576929000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10447331\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,14]]},"references-count":25,"URL":"https:\/\/doi.org\/10.1109\/icassp48485.2024.10447331","relation":{},"subject":[],"published":{"date-parts":[[2024,4,14]]}}}