{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:48:20Z","timestamp":1776883700853,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072462"],"award-info":[{"award-number":["62072462"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3685000","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"11279-11281","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-9859-1425","authenticated-orcid":false,"given":"Yuning","family":"Wu","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9050-8304","authenticated-orcid":false,"given":"Jiatong","family":"Shi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8286-5778","authenticated-orcid":false,"given":"Yifeng","family":"Yu","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4538-7440","authenticated-orcid":false,"given":"Yuxun","family":"Tang","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1479-9179","authenticated-orcid":false,"given":"Tao","family":"Qian","sequence":"additional","affiliation":[{"name":"Shanghai High School, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1473-8981","authenticated-orcid":false,"given":"Yueqian","family":"Lin","sequence":"additional","affiliation":[{"name":"Duke University, Durham, NC, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0604-4992","authenticated-orcid":false,"given":"Jionghao","family":"Han","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5479-9692","authenticated-orcid":false,"given":"Xinyi","family":"Bai","sequence":"additional","affiliation":[{"name":"Cornell University &amp; Multimodal Art Projection, Ithaca, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5970-8631","authenticated-orcid":false,"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6486-6020","authenticated-orcid":false,"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Philip Anastassiou Jiawei Chen Jitong Chen Yuanzhe Chen Zhuo Chen Ziyi Chen Jian Cong Lelai Deng Chuang Ding Lu Gao et al. 2024. Seed-TTS: A Family of High-Quality Versatile Speech Generation Models. arXiv preprint arXiv:2406.02430 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Alexei Baevski Steffen Schneider and Michael Auli. 2019. vq-wav2vec: Self-Supervised Learning of Discrete Speech Representations. In ICLR."},{"key":"e_1_3_2_1_3_1","unstructured":"Alexei Baevski Yuhao Zhou Abdelrahman Mohamed et al. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. NeurIPS (2020)."},{"key":"e_1_3_2_1_4_1","volume-title":"David Dale, Ning Dong, Mark Duppenthaler, Paul-Ambroise Duquenne, Brian Ellis, Hady Elsahar, Justin Haaheim, et al.","author":"Chung Yu-An","year":"2023","unstructured":"Lo\"ic Barrault, Yu-An Chung, Mariano Coria Meglioli, David Dale, Ning Dong, Mark Duppenthaler, Paul-Ambroise Duquenne, Brian Ellis, Hady Elsahar, Justin Haaheim, et al. 2023. Seamless: Multilingual Expressive and Streaming Speech Translation. arXiv preprint arXiv:2312.05187 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Sequence-to-Sequence Singing Synthesis Using the Feed-Forward Transformer. ICASSP","author":"Blaauw Merlijn","year":"2019","unstructured":"Merlijn Blaauw and Jordi Bonada. 2019. Sequence-to-Sequence Singing Synthesis Using the Feed-Forward Transformer. ICASSP (2019), 7229--7233."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Xuankai Chang Jiatong Shi Jinchuan Tian Yuning Wu Yuxun Tang Yihan Wu Shinji Watanabe Yossi Adi Xie Chen and Qin Jin. 2024. The Interspeech 2024 Challenge on Speech Processing Using Discrete Units. In Interspeech.","DOI":"10.21437\/Interspeech.2024-1878"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Xuankai Chang Brian Yan Kwanghee Choi Jeeweon Jung Yichen Lu Soumi Maiti Roshan Sharma Jiatong Shi Jinchuan Tian Shinji Watanabe et al. 2023. Exploring speech recognition translation and understanding with discrete speech units: A comparative study. In ICASSP.","DOI":"10.1109\/ICASSP48485.2024.10447929"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Xuankai Chang Brian Yan Yuya Fujita Takashi Maekaku and Shinji Watanabe. 2023. Exploration of Efficient End-to-End ASR using Discretized Input from Self-Supervised Learning. In Interspeech.","DOI":"10.21437\/Interspeech.2023-2051"},{"key":"e_1_3_2_1_9_1","first-page":"1505","article-title":"WavLM","volume":"16","author":"Chen Sanyuan","year":"2021","unstructured":"Sanyuan Chen, Chengyi Wang, Zhengyang Chen, et al. 2021. WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing. IJSTSP, Vol. 16 (2021), 1505--1518.","journal-title":"Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing. IJSTSP"},{"key":"e_1_3_2_1_10_1","unstructured":"Alexandre D'efossez Jade Copet Gabriel Synnaeve et al. 2022. High Fidelity Neural Audio Compression. ArXiv Vol. abs\/2210.13438 (2022)."},{"key":"e_1_3_2_1_11_1","unstructured":"Sang gil Lee Wei Ping Boris Ginsburg et al. 2022. BigVGAN: A Universal Neural Vocoder with Large-Scale Training. ArXiv Vol. abs\/2206.04658 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Bytesing: A Chinese singing voice synthesis system using duration allocated encoder-decoder acoustic models and WaveRNN vocoders. In ISCSLP.","author":"Gu Yu","year":"2021","unstructured":"Yu Gu, Xiang Yin, Yonghui Rao, et al. 2021. Bytesing: A Chinese singing voice synthesis system using duration allocated encoder-decoder acoustic models and WaveRNN vocoders. In ISCSLP."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Shuai Guo Jiatong Shi Tao Qian et al. 2022. SingAug: Data Augmentation for Singing Voice Synthesis with Cycle-consistent Training Strategy. In Interspeech.","DOI":"10.21437\/Interspeech.2022-978"},{"key":"e_1_3_2_1_14_1","volume-title":"Discretalk: Text-to-speech as a machine translation problem. arXiv preprint arXiv:2005.05525","author":"Tomoki Hayashi","year":"2020","unstructured":"Tomoki Hayashi et al. 2020. Discretalk: Text-to-speech as a machine translation problem. arXiv preprint arXiv:2005.05525 (2020)."},{"key":"e_1_3_2_1_15_1","first-page":"3451","article-title":"HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units","volume":"29","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, et al. 2021. HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. TASLP, Vol. 29 (2021), 3451--3460.","journal-title":"TASLP"},{"key":"e_1_3_2_1_16_1","volume-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis. NeurIPS","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis. NeurIPS (2020)."},{"key":"e_1_3_2_1_17_1","unstructured":"Kundan Kumar Rithesh Kumar Thibault de Boissi\u00e8re et al. 2019. MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis. In NeurIPS."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Ann Lee Peng-Jen Chen Changhan Wang et al. 2022. Direct Speech-to-Speech Translation With Discrete Units. In ACL. 3327--3339.","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"e_1_3_2_1_19_1","volume-title":"MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training. ArXiv","author":"Li Yizhi","year":"2023","unstructured":"Yizhi Li, Ruibin Yuan, Ge Zhang, et al. 2023. MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training. ArXiv, Vol. abs\/2306.00107 (2023)."},{"key":"e_1_3_2_1_20_1","unstructured":"Jinglin Liu Chengxi Li Yi Ren et al. 2021. DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism. In AAAI."},{"key":"e_1_3_2_1_21_1","unstructured":"Peiling Lu Jie Wu Jian Luan et al. 2020. XiaoiceSing: A High-Quality and Integrated Singing Voice Synthesis System. In Interspeech."},{"key":"e_1_3_2_1_22_1","volume-title":"StyleMelGAN: An Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization. ICASSP","author":"Mustafa Ahmed","year":"2020","unstructured":"Ahmed Mustafa, Nicola Pia, and Guillaume Fuchs. 2020. StyleMelGAN: An Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization. ICASSP (2020), 6034--6038."},{"key":"e_1_3_2_1_23_1","volume-title":"Seventh ISCA Workshop on Speech Synthesis.","author":"Oura Keiichiro","year":"2010","unstructured":"Keiichiro Oura, Ayami Mase, Tomohiko Yamada, et al. 2010. Recent development of the HMM-based singing voice synthesis system-Sinsy. In Seventh ISCA Workshop on Speech Synthesis."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Adam Polyak Yossi Adi Jade Copet Eugene Kharitonov Kushal Lakhotia Wei-Ning Hsu Abdelrahman Mohamed and Emmanuel Dupoux. 2021. Speech Resynthesis from Discrete Disentangled Self-Supervised Representations. In Interspeech.","DOI":"10.21437\/Interspeech.2021-475"},{"key":"e_1_3_2_1_25_1","unstructured":"Jiatong Shi Shuai Guo Nan Huo et al. 2020. Sequence-To-Sequence Singing Voice Synthesis With Perceptual Entropy Loss. ICASSP (2020)."},{"key":"e_1_3_2_1_26_1","unstructured":"Jiatong Shi Shuai Guo Tao Qian et al. 2022. Muskits: an End-to-End Music Processing Toolkit for Singing Voice Synthesis. In Interspeech."},{"key":"e_1_3_2_1_27_1","unstructured":"Jiatong Shi Chan-Jan Hsu Holam Chung Dongji Gao Paola Garcia Shinji Watanabe Ann Lee and Hung-yi Lee. 2023. Bridging speech and textual pre-trained models with unsupervised ASR. In ICASSP."},{"key":"e_1_3_2_1_28_1","unstructured":"Jiatong Shi Hirofumi Inaguma Xutai Ma et al. 2023. Multi-resolution HuBERT: Multi-resolution Speech Self-Supervised Learning with Masked Unit Prediction. In ICLR."},{"key":"e_1_3_2_1_29_1","unstructured":"Jiatong Shi Yun Tang Ann Lee Hirofumi Inaguma Changhan Wang Juan Pino and Shinji Watanabe. 2023. Enhancing Speech-To-Speech Translation with Multiple TTS Targets. In ICASSP."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Yuxun Tang Yuning Wu Jiatong Shi and Qin Jin. 2024. SingOMD: Singing Oriented Multi-resolution Discrete Representation Construction from Speech Models. In Interspeech.","DOI":"10.21437\/Interspeech.2024-2291"},{"key":"e_1_3_2_1_31_1","unstructured":"Chengyi Wang Sanyuan Chen Yu Wu Ziqiang Zhang Long Zhou Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et al. 2023. Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Xiaoicesing 2: A high-fidelity singing voice synthesizer based on generative adversarial network. arXiv preprint arXiv:2210.14666","author":"Wang Chunhui","year":"2022","unstructured":"Chunhui Wang, Chang Zeng, and Xing He. 2022. Xiaoicesing 2: A high-fidelity singing voice synthesizer based on generative adversarial network. arXiv preprint arXiv:2210.14666 (2022)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552466.3556534"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Shinji Watanabe Takaaki Hori Shigeki Karita Tomoki Hayashi Jiro Nishitoba Yuya Unno Nelson Yalta Jahn Heymann Matthew Wiesner Nanxin Chen Adithya Renduchintala and Tsubasa Ochiai. 2018. ESPnet: End-to-End Speech Processing Toolkit. In Interspeech.","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"e_1_3_2_1_35_1","unstructured":"Yuning Wu Jiatong Shi Tao Qian Dongji Gao and Qin Jin. 2023. PHONEix: Acoustic Feature Processing Strategy for Enhanced Singing Pronunciation With Phoneme Distribution Predictor. ICASSP."},{"key":"e_1_3_2_1_36_1","volume-title":"A Systematic Exploration of Joint-training for Singing Voice Synthesis. ArXiv","author":"Wu Yuning","year":"2023","unstructured":"Yuning Wu, Yifeng Yu, Jiatong Shi, Tao Qian, and Qin Jin. 2023. A Systematic Exploration of Joint-training for Singing Voice Synthesis. ArXiv, Vol. abs\/2308.02867 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Jiatong Shi, Yuxun Tang, Shan Yang, and Qin Jin.","author":"Wu Yuning","year":"2024","unstructured":"Yuning Wu, Chunlei zhang, Jiatong Shi, Yuxun Tang, Shan Yang, and Qin Jin. 2024. TokSing: Singing Voice Synthesis based on Discrete Tokens. In Interspeech."},{"key":"e_1_3_2_1_38_1","volume-title":"NNSVS: A neural network-based singing voice synthesis toolkit. In ICASSP.","author":"Yamamoto Ryuichi","year":"2023","unstructured":"Ryuichi Yamamoto, Reo Yoneyama, and Tomoki Toda. 2023. NNSVS: A neural network-based singing voice synthesis toolkit. In ICASSP."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Brian Yan Jiatong Shi Yun Tang Hirofumi Inaguma Yifan Peng Siddharth Dalmia Peter Pol\u00e1k Patrick Fernandes Dan Berrebbi Tomoki Hayashi Xiaohui Zhang Zhaoheng Ni Moto Hira Soumi Maiti Juan Pino and Shinji Watanabe. 2023. ESPnet-ST-v2: Multipurpose Spoken Language Translation Toolkit. In ACL.","DOI":"10.18653\/v1\/2023.acl-demo.38"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447751"},{"key":"e_1_3_2_1_41_1","volume-title":"VISinger2: End-to-End Singing Voice Synthesis Augmented by Self-Supervised Learning Representation. ArXiv","author":"Yu Yifeng","year":"2024","unstructured":"Yifeng Yu, Jiatong Shi, Yuning Wu, and Shinji Watanabe. 2024. VISinger2: End-to-End Singing Voice Synthesis Augmented by Self-Supervised Learning Representation. ArXiv, Vol. abs\/2406.08761 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"SingMOS: An extensive Open-Source Singing Voice Dataset for MOS Prediction. arXiv preprint arXiv:2406.10911","author":"Qin Jin Yuxun Tang Yuning Wu","year":"2024","unstructured":"Yuning Wu Qin Jin Yuxun Tang, Jiatong Shi. 2024. SingMOS: An extensive Open-Source Singing Voice Dataset for MOS Prediction. arXiv preprint arXiv:2406.10911 (2024)."},{"key":"e_1_3_2_1_43_1","first-page":"495","article-title":"SoundStream","volume":"30","author":"Zeghidour Neil","year":"2021","unstructured":"Neil Zeghidour, Alejandro Luebs, Ahmed Omran, et al. 2021. SoundStream: An End-to-End Neural Audio Codec. TASLP, Vol. 30 (2021), 495--507.","journal-title":"An End-to-End Neural Audio Codec. TASLP"},{"key":"e_1_3_2_1_44_1","volume-title":"SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities. In The 2023 Conference on Empirical Methods in Natural Language Processing.","author":"Zhang Dong","year":"2023","unstructured":"Dong Zhang, Shimin Li, Xin Zhang, Jun Zhan, Pengyu Wang, Yaqian Zhou, and Xipeng Qiu. 2023. SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities. In The 2023 Conference on Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Yongmao Zhang Jian Cong Heyang Xue et al. 2021. VISinger: Variational Inference with Adversarial Learning for End-to-End Singing Voice Synthesis. ICASSP (2021) 7237--7241.","DOI":"10.1109\/ICASSP43922.2022.9747664"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Yongmao Zhang Heyang Xue Hanzhao Li et al. 2022. VISinger 2: High-Fidelity End-to-End Singing Voice Synthesis Enhanced by Digital Signal Processing Synthesizer. ArXiv Vol. abs\/2211.02903 (2022).","DOI":"10.21437\/Interspeech.2023-391"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3685000","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3685000","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:28Z","timestamp":1750295848000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3685000"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":46,"alternative-id":["10.1145\/3664647.3685000","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3685000","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}