{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:26:41Z","timestamp":1775230001346,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62076144"],"award-info":[{"award-number":["62076144"]}]},{"name":"The Center for Perceptual and Interactive Intelligence (CPII) Ltd under the Innovation and Technology Commission's InnoHK Scheme"},{"name":"Shenzhen Science and Technology Program","award":["WDZC20220816140515001"],"award-info":[{"award-number":["WDZC20220816140515001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612485","type":"proceedings-article","created":{"date-parts":[[2023,11,2]],"date-time":"2023-11-02T10:35:57Z","timestamp":1698921357000},"page":"2829-2837","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["SpeechTripleNet: End-to-End Disentangled Speech Representation Learning for Content, Timbre and Prosody"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7698-6262","authenticated-orcid":false,"given":"Hui","family":"Lu","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9543-1572","authenticated-orcid":false,"given":"Xixin","family":"Wu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8533-0524","authenticated-orcid":false,"given":"Zhiyong","family":"Wu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4427-3532","authenticated-orcid":false,"given":"Helen","family":"Meng","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong SAR, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Understanding disentangling in (\u03b2)-VAE. CoRR","author":"Burgess Christopher P.","year":"2018","unstructured":"Christopher P. Burgess, Irina Higgins, Arka Pal, Lo\u00efc Matthey, Nick Watters, Guillaume Desjardins, and Alexander Lerchner. 2018. Understanding disentangling in (\u03b2)-VAE. CoRR, Vol. abs\/1804.03599 (2018). [arXiv]1804.03599 http:\/\/arxiv.org\/abs\/1804.03599"},{"key":"e_1_3_2_1_2_1","volume-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 6332--6336","author":"Chan Chak Ho","year":"2022","unstructured":"Chak Ho Chan, Kaizhi Qian, Yang Zhang, and Mark Hasegawa-Johnson. 2022. Speechsplit2. 0: Unsupervised speech disentanglement for voice conversion without tuning autoencoder bottlenecks. In ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 6332--6336."},{"key":"e_1_3_2_1_3_1","volume-title":"20th Annual Conference of the International Speech Communication Association","author":"Chou Ju-Chieh","year":"2019","unstructured":"Ju-Chieh Chou and Hung-yi Lee. 2019. One-Shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization. In Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, Graz, Austria, 15-19 September 2019, Gernot Kubin and Zdravko Kacic (Eds.). ISCA, 664--668."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/3326943.3327009"},{"key":"e_1_3_2_1_5_1","volume-title":"5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net.","author":"Higgins Irina","year":"2017","unstructured":"Irina Higgins, Lo\u00efc Matthey, Arka Pal, Christopher Burgess, Xavier Glorot, Matthew Botvinick, Shakir Mohamed, and Alexander Lerchner. 2017. beta-VAE: Learning Basic Visual Concepts with a Constrained Variational Framework. In 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net."},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Hsu Wei-Ning","year":"2017","unstructured":"Wei-Ning Hsu, Yu Zhang, and James R. Glass. 2017. Unsupervised Learning of Disentangled and Interpretable Representations from Sequential Data. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna M. Wallach, Rob Fergus, S. V. N. Vishwanathan, and Roman Garnett (Eds.). 1878--1889."},{"key":"e_1_3_2_1_7_1","volume-title":"5th International Conference on Learning Representations, ICLR","author":"Jang Eric","year":"2017","unstructured":"Eric Jang, Shixiang Gu, and Ben Poole. 2017. Categorical Reparameterization with Gumbel-Softmax. In 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=rkE3y85ee"},{"key":"e_1_3_2_1_8_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"e_1_3_2_1_9_1","volume-title":"Kingma and Max Welling","author":"Diederik","year":"2014","unstructured":"Diederik P. Kingma and Max Welling. 2014. Auto-Encoding Variational Bayes. In 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14-16, 2014, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.)."},{"key":"e_1_3_2_1_10_1","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in Neural Information Processing Systems, Vol. 33 (2020), 17022--17033.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_11_1","unstructured":"Oleksii Kuchaiev Jason Li Huyen Nguyen Oleksii Hrinchuk Ryan Leary Boris Ginsburg Samuel Kriman Stanislav Beliaev Vitaly Lavrukhin Jack Cook et al. 2019. Nemo: a toolkit for building ai applications using neural modules. arXiv preprint arXiv:1909.09577 (2019)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781139166621.003"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022787"},{"key":"e_1_3_2_1_14_1","volume-title":"37th International Conference on Machine Learning, ICML 2020 (37th International Conference on Machine Learning, ICML","author":"Qian Kaizhi","year":"2020","unstructured":"Kaizhi Qian, Yang Zhang, Shiyu Chang, David Cox, and Mark Hasegawa-Johnson. 2020. Unsupervised speech decomposition via triple information bottleneck. In 37th International Conference on Machine Learning, ICML 2020 (37th International Conference on Machine Learning, ICML 2020), Hal Daume and Aarti Singh (Eds.). International Machine Learning Society (IMLS), 7792--7802. Publisher Copyright: Copyright textcopyright 2020 by the Authors. All rights reserved.; 37th International Conference on Machine Learning, ICML 2020; Conference date: 13-07-2020 Through 18-07-2020."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning, ICML 2019","volume":"5219","author":"Qian Kaizhi","year":"2019","unstructured":"Kaizhi Qian, Yang Zhang, Shiyu Chang, Xuesong Yang, and Mark Hasegawa-Johnson. 2019. AutoVC: Zero-Shot Voice Style Transfer with Only Autoencoder Loss. In Proceedings of the 36th International Conference on Machine Learning, ICML 2019, 9-15 June 2019, Long Beach, California, USA (Proceedings of Machine Learning Research, Vol. 97), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, 5210--5219."},{"key":"e_1_3_2_1_16_1","volume-title":"Garnett (Eds.)","volume":"32","author":"Ren Yi","year":"2019","unstructured":"Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. FastSpeech: Fast, Robust and Controllable Text to Speech. In Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.), Vol. 32. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/f63f65b503e22cb970527f23c9ad7db1-Paper.pdf"},{"key":"e_1_3_2_1_17_1","volume-title":"Instance normalization: The missing ingredient for fast stylization. arXiv preprint arXiv:1607.08022","author":"Ulyanov Dmitry","year":"2016","unstructured":"Dmitry Ulyanov, Andrea Vedaldi, and Victor Lempitsky. 2016. Instance normalization: The missing ingredient for fast stylization. arXiv preprint arXiv:1607.08022 (2016)."},{"key":"e_1_3_2_1_18_1","volume-title":"22nd Annual Conference of the International Speech Communication Association, Brno","author":"Wang Disong","year":"2021","unstructured":"Disong Wang, Liqun Deng, Yu Ting Yeung, Xiao Chen, Xunying Liu, and Helen Meng. 2021. VQMIVC: Vector Quantization and Mutual Information-Based Unsupervised Speech Representation Disentanglement for One-Shot Voice Conversion. In Interspeech 2021, 22nd Annual Conference of the International Speech Communication Association, Brno, Czechia, 30 August - 3 September 2021, Hynek Hermansky, Honza Cernock\u00fd, Luk\u00e1s Burget, Lori Lamel, Odette Scharenborg, and Petr Motl\u00edcek (Eds.). ISCA, 1344--1348."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9892405"},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Machine Learning. PMLR, 5180--5189","author":"Wang Yuxuan","year":"2018","unstructured":"Yuxuan Wang, Daisy Stanton, Yu Zhang, RJ-Skerry Ryan, Eric Battenberg, Joel Shor, Ying Xiao, Ye Jia, Fei Ren, and Rif A Saurous. 2018. Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis. In International Conference on Machine Learning. PMLR, 5180--5189."},{"key":"e_1_3_2_1_21_1","volume-title":"21st Annual Conference of the International Speech Communication Association, Virtual Event","author":"Wu Da-Yi","year":"2020","unstructured":"Da-Yi Wu, Yen-Hao Chen, and Hung-yi Lee. 2020. VQVC: One-Shot Voice Conversion by Vector Quantization and U-Net Architecture. In Interspeech 2020, 21st Annual Conference of the International Speech Communication Association, Virtual Event, Shanghai, China, 25-29 October 2020, Helen Meng, Bo Xu, and Thomas Fang Zheng (Eds.). ISCA, 4691--4695."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053854"},{"key":"e_1_3_2_1_23_1","unstructured":"Junichi Yamagishi Christophe Veaux and Kirsten MacDonald. 2019. CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit (version 0.92)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","unstructured":"Ya-Jie Zhang Shifeng Pan Lei He and Zhen-Hua Ling. 2019. Learning Latent Representations for Style Control and Transfer in End-to-end Speech Synthesis. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 6945--6949. https:\/\/doi.org\/10.1109\/ICASSP.2019.8683623","DOI":"10.1109\/ICASSP.2019.8683623"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612485","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612485","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:59:26Z","timestamp":1755820766000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612485"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":24,"alternative-id":["10.1145\/3581783.3612485","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612485","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}