{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:14:14Z","timestamp":1765008854554,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62261057"],"award-info":[{"award-number":["62261057"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3771028","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:08:11Z","timestamp":1765008491000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ECTSpeech: Enhancing Efficient Speech Synthesis via Easy Consistency Tuning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4085-921X","authenticated-orcid":false,"given":"tao","family":"zhu","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Xinjiang University, Urumqi, China and Joint International Research Laboratory of Silk Road Multilingual Cognitive Computing, Urumqi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3089-4140","authenticated-orcid":false,"given":"Yinfeng","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xinjinag University, Urumqi, China and Joint International Research Laboratory of Silk Road Multilingual Cognitive Computing, Urumqi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0210-2273","authenticated-orcid":false,"given":"Liejun","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xinjinag University, Urumqi, China and Joint International Research Laboratory of Silk Road Multilingual Cognitive Computing, Urumqi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3546-6305","authenticated-orcid":false,"given":"Fuchun","family":"Sun","sequence":"additional","affiliation":[{"name":"Tsinghua university, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7575-5876","authenticated-orcid":false,"given":"Wendong","family":"Zheng","sequence":"additional","affiliation":[{"name":"Tianjin University of Technology, Tianjin, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Kingma DP Ba\u00a0J Adam et\u00a0al. 2014. A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 1412 6 (2014)."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/SMC54092.2024.10831778"},{"key":"e_1_3_3_1_4_2","unstructured":"Zehua Chen Yihan Wu Yichong Leng Jiawei Chen Haohe Liu Xu Tan Yang Cui Ke Wang Lei He Sheng Zhao et\u00a0al. 2022. Resgrad: Residual denoising diffusion probabilistic models for text to speech. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.14518 (2022)."},{"key":"e_1_3_3_1_5_2","unstructured":"Min Chu and Hu Peng. 2006. Objective measure for estimating mean opinion score of synthesized speech. US Patent 7 024 362."},{"key":"e_1_3_3_1_6_2","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Geng Zhengyang","year":"2025","unstructured":"Zhengyang Geng, Ashwini Pokle, Weijian Luo, Justin Lin, and J\u00a0Zico Kolter. 2025. Consistency Models Made Easy. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_3_1_7_2","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_8_2","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547855"},{"key":"e_1_3_3_1_10_2","unstructured":"Keith Ito and Linda Johnson. 2017. The LJ Speech Dataset. https:\/\/keithito.com\/LJ-Speech-Dataset\/."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-469"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME57554.2024.10688053"},{"key":"e_1_3_3_1_13_2","unstructured":"Tero Karras Miika Aittala Timo Aila and Samuli Laine. 2022. Elucidating the design space of diffusion-based generative models. Advances in neural information processing systems 35 (2022) 26565\u201326577."},{"key":"e_1_3_3_1_14_2","unstructured":"Jungil Kong Jaehyeon Kim and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems 33 (2020) 17022\u201317033."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"e_1_3_3_1_16_2","unstructured":"Haohe Liu Zehua Chen Yi Yuan Xinhao Mei Xubo Liu Danilo Mandic Wenwu Wang and Mark\u00a0D Plumbley. 2023. Audioldm: Text-to-audio generation with latent diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2301.12503 (2023)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21350"},{"key":"e_1_3_3_1_18_2","unstructured":"Songxiang Liu Dan Su and Dong Yu. 2022. Diffgan-tts: High-fidelity and efficient text-to-speech with denoising diffusion gans. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2201.11972 (2022)."},{"key":"e_1_3_3_1_19_2","first-page":"8599","volume-title":"International conference on machine learning","author":"Popov Vadim","year":"2021","unstructured":"Vadim Popov, Ivan Vovk, Vladimir Gogoryan, Tasnima Sadekova, and Mikhail Kudinov. 2021. Grad-tts: A diffusion probabilistic model for text-to-speech. In International conference on machine learning. PMLR, 8599\u20138608."},{"key":"e_1_3_3_1_20_2","unstructured":"Yi Ren Chenxu Hu Xu Tan Tao Qin Sheng Zhao Zhou Zhao and Tie-Yan Liu. 2020. Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.04558 (2020)."},{"key":"e_1_3_3_1_21_2","unstructured":"Yi Ren Yangjun Ruan Xu Tan Tao Qin Sheng Zhao Zhou Zhao and Tie-Yan Liu. 2019. Fastspeech: Fast robust and controllable text to speech. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_3_1_23_2","series-title":"(ICML\u201923)","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Song Yang","year":"2023","unstructured":"Yang Song, Prafulla Dhariwal, Mark Chen, and Ilya Sutskever. 2023. Consistency models. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML\u201923). JMLR.org, Article 1335, 42\u00a0pages."},{"key":"e_1_3_3_1_24_2","volume-title":"International Conference on Learning Representations","author":"Song Yang","year":"2021","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik\u00a0P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2021. Score-Based Generative Modeling through Stochastic Differential Equations. In International Conference on Learning Representations."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612061"},{"key":"e_1_3_3_1_27_2","first-page":"3","volume-title":"International Conference on Cognitive Systems and Signal Processing","author":"Yu Yinfeng","year":"2021","unstructured":"Yinfeng Yu, Zhenhong Jia, Fei Shi, Meiling Zhu, Wenjun Wang, and Xiuhong Li. 2021. Weavenet: End-to-end audiovisual sentiment analysis. In International Conference on Cognitive Systems and Signal Processing. Springer, 3\u201316."}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3771028","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:10:06Z","timestamp":1765008606000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3771028"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":26,"alternative-id":["10.1145\/3743093.3771028","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3771028","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}