{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T21:24:57Z","timestamp":1768339497327,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":30,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658029","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"951-959","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["End-to-End Thai Text-to-Speech with Linguistic Unit"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-7927-2779","authenticated-orcid":false,"given":"Kontawat","family":"Wisetpaitoon","sequence":"first","affiliation":[{"name":"Kasikorn Labs Co. Ltd., Nonthaburi, Thailand"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7335-7105","authenticated-orcid":false,"given":"Sattaya","family":"Singkul","sequence":"additional","affiliation":[{"name":"Kasikorn Labs Co. Ltd., Nonthaburi, Thailand"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9640-2105","authenticated-orcid":false,"given":"Theerat","family":"Sakdejayont","sequence":"additional","affiliation":[{"name":"Kasikorn Labs Co. Ltd., Nonthaburi, Thailand"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4154-8745","authenticated-orcid":false,"given":"Tawunrat","family":"Chalothorn","sequence":"additional","affiliation":[{"name":"Kasikorn Labs Co. Ltd., Nonthaburi, Thailand"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference. 4218--4222","author":"Ardila Rosana","year":"2020","unstructured":"Rosana Ardila, Megan Branson, Kelly Davis, Michael Kohler, Josh Meyer, Michael Henretty, Reuben Morais, Lindsay Saunders, Francis Tyers, and Gregor Weber. 2020. Common Voice: A Massively-Multilingual Speech Corpus. In Proceedings of the Twelfth Language Resources and Evaluation Conference. 4218--4222."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-91699-2_39"},{"key":"e_1_3_2_1_3_1","volume-title":"Frederico Santos de Oliveira, Arnaldo Candido Junior, Anderson da Silva Soares, Sandra Maria Aluisio, and Moacir Antonelli Ponti.","author":"Casanova Edresson","year":"2021","unstructured":"Edresson Casanova, Christopher Shulby, Eren G\u00f6lge, Nicolas Michael M\u00fcller, Frederico Santos de Oliveira, Arnaldo Candido Junior, Anderson da Silva Soares, Sandra Maria Aluisio, and Moacir Antonelli Ponti. 2021b. Sc-glowtts: an efficient zero-shot multi-speaker text-to-speech model. arXiv preprint arXiv:2104.05557 (2021)."},{"key":"e_1_3_2_1_4_1","volume-title":"International Conference on Machine Learning. PMLR, 2709--2720","author":"Casanova Edresson","year":"2022","unstructured":"Edresson Casanova, Julian Weber, Christopher D Shulby, Arnaldo Candido Junior, Eren G\u00f6lge, and Moacir A Ponti. 2022. Yourtts: Towards zero-shot multi-speaker tts and zero-shot voice conversion for everyone. In International Conference on Machine Learning. PMLR, 2709--2720."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.3844\/jcssp.2011.359.365"},{"key":"e_1_3_2_1_6_1","volume-title":"VoxCeleb2: Deep speaker recognition. Interspeech 2018","author":"Chung J","year":"2018","unstructured":"J Chung, A Nagrani, and A Zisserman. 2018. VoxCeleb2: Deep speaker recognition. Interspeech 2018 (2018)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1064"},{"key":"e_1_3_2_1_8_1","unstructured":"Eren G\u00f6lge. 2020. Solving attention problems of tts models with double decoder consistency."},{"key":"e_1_3_2_1_9_1","volume-title":"The Oriental COCOSDA 2003","author":"Hansakunbuntheung Chatchawarn","year":"2003","unstructured":"Chatchawarn Hansakunbuntheung, Virongrong Tesprasit, and Virach Sornlertlamvanich. 2003. Thai tagged speech corpus for speech synthesis. The Oriental COCOSDA 2003 (2003), 97--104."},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Machine Learning. PMLR, 5530--5540","author":"Kim Jaehyeon","year":"2021","unstructured":"Jaehyeon Kim, Jungil Kong, and Juhee Son. 2021. Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech. In International Conference on Machine Learning. PMLR, 5530--5540."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-022-13943-4"},{"key":"e_1_3_2_1_12_1","volume-title":"Deep speaker: an end-to-end neural speaker embedding system. arXiv preprint arXiv:1705.02304","author":"Li Chao","year":"2017","unstructured":"Chao Li, Xiaokong Ma, Bing Jiang, Xiangang Li, Xuewei Zhang, Xiao Liu, Ying Cao, Ajay Kannan, and Zhenyao Zhu. 2017. Deep speaker: an end-to-end neural speaker embedding system. arXiv preprint arXiv:1705.02304 (2017)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"e_1_3_2_1_14_1","unstructured":"Wannaphong Phatthiyaphaibun. 2020. thai-g2p. https:\/\/github.com\/wannaphong\/thai-g2p\/."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2018.12.003"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/O-COCOSDA46868.2019.9041212"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/iSAI-NLP48611.2019.9045639"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICITEED.2019.8930002"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639221"},{"key":"e_1_3_2_1_21_1","volume-title":"Text-to-speech synthesis","author":"Taylor Paul","unstructured":"Paul Taylor. 2009. Text-to-speech synthesis. Cambridge university press."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/O-COCOSDA50338.2020.9295001"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2011.04.142"},{"key":"e_1_3_2_1_24_1","volume-title":"Tacotron: Towards end-to-end speech synthesis. arXiv preprint arXiv:1703.10135","author":"Wang Yuxuan","year":"2017","unstructured":"Yuxuan Wang, RJ Skerry-Ryan, Daisy Stanton, Yonghui Wu, Ron J Weiss, Navdeep Jaitly, Zongheng Yang, Ying Xiao, Zhifeng Chen, Samy Bengio, et al. 2017. Tacotron: Towards end-to-end speech synthesis. arXiv preprint arXiv:1703.10135 (2017)."},{"key":"e_1_3_2_1_25_1","unstructured":"Chai Wutiwiwatchai Patcharika Chootrakool Sittipong Saychum Nattanun Thatphithakkul Anocha Rugchatjaroen and Ausdang Thangthai. [n. d.]. TSynC-2: Thai Speech Synthesis Corpus Version 2. ([n. d.])."},{"key":"e_1_3_2_1_26_1","unstructured":"Chai Wutiwiwatchai Patcharika Chootrakool Sittipong Saychum Nattanun Thatphithakkul Anocha Rugchatjaroen and Ausdang Thangthai. 2008. TSynC-2: Thai Speech Synthesis Corpus Version 2 TSynC-2. (2008)."},{"key":"e_1_3_2_1_27_1","first-page":"1","article-title":"Thai text-to-speech synthesis: a review","volume":"2","author":"Wutiwiwatchai Chai","year":"2017","unstructured":"Chai Wutiwiwatchai, Chatchawarn Hansakunbuntheung, Anocha Rugchatjaroen, Sittipong Saychum, Sawit Kasuriya, and Patcharika Chootrakool. 2017. Thai text-to-speech synthesis: a review. Journal of Intelligent Informatics and Smart Technology, Vol. 2, 2 (2017), 1--8.","journal-title":"Journal of Intelligent Informatics and Smart Technology"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Detai Xin Yuki Saito Shinnosuke Takamichi Tomoki Koriyama and Hiroshi Saruwatari. 2021. Cross-Lingual Speaker Adaptation Using Domain Adaptation and Speaker Consistency Loss for Text-To-Speech Synthesis.. In Interspeech. 1614--1618.","DOI":"10.21437\/Interspeech.2021-897"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCSIT.2008.158"},{"key":"e_1_3_2_1_30_1","volume-title":"ByT5 model for massively multilingual grapheme-to-phoneme conversion. arXiv preprint arXiv:2204.03067","author":"Zhu Jian","year":"2022","unstructured":"Jian Zhu, Cong Zhang, and David Jurgens. 2022. ByT5 model for massively multilingual grapheme-to-phoneme conversion. arXiv preprint arXiv:2204.03067 (2022)."}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658029","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658029","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:48:12Z","timestamp":1755766092000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658029"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":30,"alternative-id":["10.1145\/3652583.3658029","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658029","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}