{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T14:55:18Z","timestamp":1777128918081,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China","award":["No. 62222211"],"award-info":[{"award-number":["No. 62222211"]}]},{"name":"National Natural Science Foundation of China","award":["No.62072397"],"award-info":[{"award-number":["No.62072397"]}]},{"name":"National Natural Science Foundation of China","award":["No.61836002"],"award-info":[{"award-number":["No.61836002"]}]},{"name":"National Key R&D Program of China","award":["No.2022ZD0162000"],"award-info":[{"award-number":["No.2022ZD0162000"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612150","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"7569-7579","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["UniSinger: Unified End-to-End Singing Voice Synthesis With Cross-Modality Information Matching"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0907-6621","authenticated-orcid":false,"given":"Zhiqing","family":"Hong","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9083-1628","authenticated-orcid":false,"given":"Chenye","family":"Cui","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1695-9000","authenticated-orcid":false,"given":"Rongjie","family":"Huang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0389-4090","authenticated-orcid":false,"given":"Lichao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4420-4163","authenticated-orcid":false,"given":"Jinglin","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3024-9624","authenticated-orcid":false,"given":"Jinzheng","family":"He","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"International Conference on Machine Learning. PMLR, 2709--2720","author":"Casanova Edresson","year":"2022","unstructured":"Edresson Casanova, Julian Weber, Christopher D Shulby, Arnaldo Candido Junior, Eren G\u00f6lge, and Moacir A Ponti. 2022. Yourtts: Towards zero-shot multi-speaker tts and zero-shot voice conversion for everyone. In International Conference on Machine Learning. PMLR, 2709--2720."},{"key":"e_1_3_2_1_2_1","volume-title":"Hifisinger: Towards high-fidelity neural singing voice synthesis. arXiv preprint arXiv:2009.01776","author":"Chen Jiawei","year":"2020","unstructured":"Jiawei Chen, Xu Tan, Jian Luan, Tao Qin, and Tie-Yan Liu. 2020. Hifisinger: Towards high-fidelity neural singing voice synthesis. arXiv preprint arXiv:2009.01776 (2020)."},{"key":"e_1_3_2_1_3_1","volume-title":"International conference on machine learning. PMLR, 1779--1788","author":"Cheng Pengyu","year":"2020","unstructured":"Pengyu Cheng, Weituo Hao, Shuyang Dai, Jiachang Liu, Zhe Gan, and Lawrence Carin. 2020. Club: A contrastive log-ratio upper bound of mutual information. In International conference on machine learning. PMLR, 1779--1788."},{"key":"e_1_3_2_1_4_1","volume-title":"Unsupervised Cross-lingual Representation Learning for Speech Recognition. arxiv","author":"Conneau Alexis","year":"2006","unstructured":"Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, and Michael Auli. 2020. Unsupervised Cross-lingual Representation Learning for Speech Recognition. arxiv: 2006.13979 [cs.CL]"},{"key":"e_1_3_2_1_5_1","volume-title":"EMOVIE: A Mandarin Emotion Speech Dataset with a Simple Emotional Text-to-Speech Model. arXiv preprint arXiv:2106.09317","author":"Cui Chenye","year":"2021","unstructured":"Chenye Cui, Yi Ren, Jinglin Liu, Feiyang Chen, Rongjie Huang, Ming Lei, and Zhou Zhao. 2021. EMOVIE: A Mandarin Emotion Speech Dataset with a Simple Emotional Text-to-Speech Model. arXiv preprint arXiv:2106.09317 (2021)."},{"key":"e_1_3_2_1_6_1","volume-title":"VarietySound: Timbre-Controllable Video to Sound Generation via Unsupervised Information Disentanglement. arXiv preprint arXiv:2211.10666","author":"Cui Chenye","year":"2022","unstructured":"Chenye Cui, Yi Ren, Jinglin Liu, Rongjie Huang, and Zhou Zhao. 2022. VarietySound: Timbre-Controllable Video to Sound Generation via Unsupervised Information Disentanglement. arXiv preprint arXiv:2211.10666 (2022)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054199"},{"key":"e_1_3_2_1_8_1","volume-title":"End-to-End Adversarial Text-to-Speech. arxiv","author":"Donahue Jeff","year":"2006","unstructured":"Jeff Donahue, Sander Dieleman, Miko?aj Bi?kowski, Erich Elsen, and Karen Simonyan. 2021. End-to-End Adversarial Text-to-Speech. arxiv: 2006.03575 [cs.SD]"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362104"},{"key":"e_1_3_2_1_11_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, Vol. 33 (2020), 6840--6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475437"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547854"},{"key":"e_1_3_2_1_14_1","volume-title":"Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models. arXiv preprint arXiv:2301.12661","author":"Huang Rongjie","year":"2023","unstructured":"Rongjie Huang, Jiawei Huang, Dongchao Yang, Yi Ren, Luping Liu, Mingze Li, Zhenhui Ye, Jinglin Liu, Xiang Yin, and Zhou Zhao. 2023. Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models. arXiv preprint arXiv:2301.12661 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Jun Wang, Dan Su, Dong Yu, Yi Ren, and Zhou Zhao.","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Max WY Lam, Jun Wang, Dan Su, Dong Yu, Yi Ren, and Zhou Zhao. 2022b. FastDiff: A Fast Conditional Diffusion Model for High-Quality Speech Synthesis. arXiv preprint arXiv:2204.09934 (2022)."},{"key":"e_1_3_2_1_16_1","unstructured":"Rongjie Huang Yi Ren Jinglin Liu Chenye Cui and Zhou Zhao. [n. d.]. GenerSpeech: Towards Style Transfer for Generalizable Out-Of-Domain Text-to-Speech. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_17_1","volume-title":"TranSpeech: Speech-to-Speech Translation With Bilateral Perturbation. arXiv preprint arXiv:2205.12523","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Zhou Zhao, Jinglin Liu, Huadai Liu, Yi Ren, Lichao Zhang, and Jinzheng He. 2022c. TranSpeech: Speech-to-Speech Translation With Bilateral Perturbation. arXiv preprint arXiv:2205.12523 (2022)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"e_1_3_2_1_20_1","volume-title":"Glow: Generative flow with invertible 1x1 convolutions. Advances in neural information processing systems","author":"Kingma Durk P","year":"2018","unstructured":"Durk P Kingma and Prafulla Dhariwal. 2018. Glow: Generative flow with invertible 1x1 convolutions. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_21_1","unstructured":"Diederik P Kingma and Max Welling. 2022. Auto-Encoding Variational Bayes. arxiv: 1312.6114 [stat.ML]"},{"key":"e_1_3_2_1_22_1","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in Neural Information Processing Systems, Vol. 33 (2020), 17022--17033.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_23_1","volume-title":"Bilateral denoising diffusion models. arXiv preprint arXiv:2108.11514","author":"Lam Max WY","year":"2021","unstructured":"Max WY Lam, Jun Wang, Rongjie Huang, Dan Su, and Dong Yu. 2021. Bilateral denoising diffusion models. arXiv preprint arXiv:2108.11514 (2021)."},{"key":"e_1_3_2_1_24_1","volume-title":"Adversarially trained end-to-end korean singing voice synthesis system. arXiv preprint arXiv:1908.01919","author":"Lee Juheon","year":"2019","unstructured":"Juheon Lee, Hyeong-Seok Choi, Chang-Bin Jeon, Junghyun Koo, and Kyogu Lee. 2019. Adversarially trained end-to-end korean singing voice synthesis system. arXiv preprint arXiv:1908.01919 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"Bigvgan: A universal neural vocoder with large-scale training. arXiv preprint arXiv:2206.04658","author":"Ping Wei","year":"2022","unstructured":"Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, and Sungroh Yoon. 2022. Bigvgan: A universal neural vocoder with large-scale training. arXiv preprint arXiv:2206.04658 (2022)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21350"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 2391--2400","author":"Mescheder Lars","year":"2017","unstructured":"Lars Mescheder, Sebastian Nowozin, and Andreas Geiger. 2017. Adversarial variational bayes: Unifying variational autoencoders and generative adversarial networks. In International conference on machine learning. PMLR, 2391--2400."},{"key":"e_1_3_2_1_29_1","volume-title":"Unsupervised singing voice conversion. arXiv preprint arXiv:1904.06590","author":"Nachmani Eliya","year":"2019","unstructured":"Eliya Nachmani and Lior Wolf. 2019. Unsupervised singing voice conversion. arXiv preprint arXiv:1904.06590 (2019)."},{"key":"e_1_3_2_1_30_1","volume-title":"Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499","author":"van den Oord Aaron","year":"2016","unstructured":"Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. 2016. Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499 (2016)."},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Machine Learning. PMLR, 7836--7846","author":"Qian Kaizhi","year":"2020","unstructured":"Kaizhi Qian, Yang Zhang, Shiyu Chang, Mark Hasegawa-Johnson, and David Cox. 2020. Unsupervised speech decomposition via triple information bottleneck. In International Conference on Machine Learning. PMLR, 7836--7846."},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Machine Learning. PMLR, 5210--5219","author":"Qian Kaizhi","year":"2019","unstructured":"Kaizhi Qian, Yang Zhang, Shiyu Chang, Xuesong Yang, and Mark Hasegawa-Johnson. 2019. Autovc: Zero-shot voice style transfer with only autoencoder loss. In International Conference on Machine Learning. PMLR, 5210--5219."},{"key":"e_1_3_2_1_33_1","volume-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558","author":"Ren Yi","year":"2020","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2020a. Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558 (2020)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403249"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Eric Tzeng Judy Hoffman Kate Saenko and Trevor Darrell. 2017. Adversarial Discriminative Domain Adaptation. arxiv: 1702.05464 [cs.CV]","DOI":"10.1109\/CVPR.2017.316"},{"key":"e_1_3_2_1_37_1","volume-title":"Xiao Chen, Xunying Liu, and Helen Meng.","author":"Wang Disong","year":"2021","unstructured":"Disong Wang, Liqun Deng, Yu Ting Yeung, Xiao Chen, Xunying Liu, and Helen Meng. 2021. Vqmivc: Vector quantization and mutual information-based unsupervised speech representation disentanglement for one-shot voice conversion. arXiv preprint arXiv:2106.10132 (2021)."},{"key":"e_1_3_2_1_38_1","unstructured":"Zhenhui Ye Zhou Zhao Yi Ren and Fei Wu. 2022. SyntaSpeech: Syntax-Aware Generative Adversarial Text-to-Speech. arxiv: 2204.11792 [cs.SD]"},{"key":"e_1_3_2_1_39_1","first-page":"6914","article-title":"M4Singer: A Multi-Style, Multi-Singer and Musical Score Provided Mandarin Singing Corpus","volume":"35","author":"Zhang Lichao","year":"2022","unstructured":"Lichao Zhang, Ruiqi Li, Shoutong Wang, Liqun Deng, Jinglin Liu, Yi Ren, Jinzheng He, Rongjie Huang, Jieming Zhu, Xiao Chen, et al. 2022b. M4Singer: A Multi-Style, Multi-Singer and Musical Score Provided Mandarin Singing Corpus. Advances in Neural Information Processing Systems, Vol. 35 (2022), 6914--6926.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","unstructured":"Lichao Zhang Zhou Zhao Yi Ren and Liqun Deng. [n. d.]. EditSinger: Zero-Shot Text-Based Singing Voice Editing System with Diverse Prosody Modeling. ([n. d.])."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747664"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612150","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612150","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:05:02Z","timestamp":1755821102000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612150"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":41,"alternative-id":["10.1145\/3581783.3612150","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612150","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}