{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T15:48:20Z","timestamp":1778860100682,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["No.2020YFC0832505"],"award-info":[{"award-number":["No.2020YFC0832505"]}]},{"name":"National Natural Science Foundation of China","award":["No.61836002 No.62072397"],"award-info":[{"award-number":["No.61836002 No.62072397"]}]},{"name":"Zhejiang Natural Science Foundation","award":["LR19F020006"],"award-info":[{"award-number":["LR19F020006"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475437","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T04:59:18Z","timestamp":1634533158000},"page":"3945-3954","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":60,"title":["Multi-Singer: Fast Multi-Singer Singing Voice Vocoder With A Large-Scale Corpus"],"prefix":"10.1145","author":[{"given":"Rongjie","family":"Huang","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feiyang","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"Ren","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinglin","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenye","family":"Cui","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"High fidelity speech synthesis with adversarial networks. arXiv preprint arXiv:1909.11646","author":"Binkowski Mikolaj","year":"2019"},{"key":"e_1_3_2_1_2_1","volume-title":"Hi-Fi Singer: Towards High-Fidelity Neural Singing Voice Synthesis. arXiv preprint arXiv:2009.01776","author":"Chen Jiawei","year":"2020"},{"key":"e_1_3_2_1_3_1","volume-title":"Multispeech: Multi-speaker text to speech with transformer. arXiv preprint arXiv:2006.04664","author":"Chen Mingjian","year":"2020"},{"key":"e_1_3_2_1_4_1","volume-title":"WaveGrad: Estimating gradients for waveform generation. arXiv preprint arXiv:2009.00713","author":"Chen Nanxin","year":"2020"},{"key":"e_1_3_2_1_5_1","volume-title":"Korean Singing Voice Synthesis Based on Auto-Regressive Boundary Equilibrium Gan. In ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 7234--7238","author":"Choi S.","year":"2020"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053950"},{"key":"e_1_3_2_1_7_1","volume-title":"Austin Cozzo, and Arun Ross","author":"Chowdhury Anurag","year":"2020"},{"key":"e_1_3_2_1_8_1","volume-title":"Waveform Model Choice, and Acoustic Configurations for Multi-Speaker End-to-End Speech Synthesis. arXiv preprint arXiv:2011.04839","author":"Cooper Erica","year":"2020"},{"key":"e_1_3_2_1_9_1","volume-title":"EMOVIE: A Mandarin Emotion Speech Dataset with a Simple Emotional Text-to-Speech Model. arXiv preprint arXiv:2106.09317","author":"Cui Chenye","year":"2021"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2013.6694316"},{"key":"e_1_3_2_1_11_1","volume-title":"LSSED: a large-scale dataset and benchmark for speech emotion recognition. arXiv preprint arXiv:2102.01754","author":"Fan Weiquan","year":"2021"},{"key":"e_1_3_2_1_12_1","volume-title":"A neural algorithm of artistic style. arXiv preprint arXiv:1508.06576","author":"Gatys Leon A","year":"2015"},{"key":"e_1_3_2_1_13_1","unstructured":"Alexey A. Gritsenko Tim Salimans Rianne van den Berg Jasper Snoek and Nal Kalchbrenner. 2020. A Spectral Energy Distance for Parallel Speech Synthesis. arXiv:2008.01160 [eess.AS]  Alexey A. Gritsenko Tim Salimans Rianne van den Berg Jasper Snoek and Nal Kalchbrenner. 2020. A Spectral Energy Distance for Parallel Speech Synthesis. arXiv:2008.01160 [eess.AS]"},{"key":"e_1_3_2_1_14_1","volume-title":"ByteSing: A Chinese Singing Voice Synthesis System Using Duration Allocated Encoder-Decoder Acoustic Models and WaveRNN Vocoders. arXiv preprint arXiv:2004.11012","author":"Gu Yu","year":"2020"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","first-page":"310","DOI":"10.1109\/TASL.2009.2026503","article-title":"On the improvement of singing voice separation for monaural recordings using the MIR-1K dataset","volume":"18","author":"Hsu Chao-Ling","year":"2009","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5555\/3327345.3327360"},{"key":"e_1_3_2_1_18_1","volume-title":"Efficient neural audio synthesis. arXiv preprint arXiv:1802.08435","author":"Kalchbrenner Nal","year":"2018"},{"key":"e_1_3_2_1_19_1","volume-title":"HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis. arXiv preprint arXiv:2010.05646","author":"Kong Jungil","year":"2020"},{"key":"e_1_3_2_1_20_1","volume-title":"Diffwave: A versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761","author":"Kong Zhifeng","year":"2020"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455622"},{"key":"e_1_3_2_1_22_1","volume-title":"Diffsinger: Diffusion acoustic model for singing voice synthesis. arXiv preprint arXiv:2105.02446","author":"Liu Jinglin","year":"2021"},{"key":"e_1_3_2_1_23_1","unstructured":"Peiling Lu Jie Wu Jian Luan Xu Tan and Li Zhou. 2020. XiaoiceSing: A High-Quality and Integrated Singing Voice Synthesis System. arXiv:2006.06261 [eess.AS]  Peiling Lu Jie Wu Jian Luan Xu Tan and Li Zhou. 2020. XiaoiceSing: A High-Quality and Integrated Singing Voice Synthesis System. arXiv:2006.06261 [eess.AS]"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/78.258122"},{"key":"e_1_3_2_1_25_1","volume-title":"International conference on machine learning. PMLR, 3918--3926","author":"Oord Aaron","year":"2018"},{"key":"e_1_3_2_1_26_1","volume-title":"Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499","author":"van den Oord Aaron","year":"2016"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_28_1","volume-title":"Speaker conditional WaveRNN: Towards universal neural vocoder for unseen speaker and recording conditions. arXiv preprint arXiv:2008.05289","author":"Paul Dipjyoti","year":"2020"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5946971"},{"key":"e_1_3_2_1_31_1","unstructured":"Yi Ren Chenxu Hu Xu Tan Tao Qin Sheng Zhao Zhou Zhao and Tie-Yan Liu. 2020. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. arXiv:2006.04558 [eess.AS]  Yi Ren Chenxu Hu Xu Tan Tao Qin Sheng Zhao Zhou Zhao and Tie-Yan Liu. 2020. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. arXiv:2006.04558 [eess.AS]"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.350"},{"key":"e_1_3_2_1_33_1","volume-title":"A study of non-autoregressive model for sequence generation. arXiv preprint arXiv:2004.10454","author":"Ren Yi","year":"2020"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454572"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403249"},{"key":"e_1_3_2_1_36_1","volume-title":"AISHELL-3: A Multispeaker Mandarin TTS Corpus and the Baselines. arXiv preprint arXiv:2010.11567","author":"Shi Yao","year":"2020"},{"key":"e_1_3_2_1_37_1","volume-title":"JVS-MuSiC: Japanese multispeaker singing-voice corpus. arXiv preprint arXiv:2001.07044","author":"Tamaru Hiroki","year":"2020"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Geng Yang Shan Yang Kai Liu Peng Fang Wei Chen and Lei Xie. 2020. Multiband MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech. arXiv:2005.05106 [cs.SD]  Geng Yang Shan Yang Kai Liu Peng Fang Wei Chen and Lei Xie. 2020. Multiband MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech. arXiv:2005.05106 [cs.SD]","DOI":"10.1109\/SLT48900.2021.9383551"},{"key":"e_1_3_2_1_41_1","volume-title":"VocGAN: A High-Fidelity Real-time Vocoder with a Hierarchically-nested Adversarial Network. arXiv preprint arXiv:2007.15256","author":"Yang Jinhyeok","year":"2020"},{"key":"e_1_3_2_1_42_1","volume-title":"Durian: Duration informed attention network for multimodal synthesis. arXiv preprint arXiv:1909.01700","author":"Yu Chengzhu","year":"2019"},{"key":"e_1_3_2_1_43_1","volume-title":"Realistic image synthesis with stacked generative adversarial networks","author":"Zhang Han","year":"2018"},{"key":"e_1_3_2_1_44_1","volume-title":"WSRGlow: A Glowbased Waveform Generative Model for Audio Super-Resolution. arXiv preprint arXiv:2106.08507","author":"Zhang Kexun","year":"2021"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475437","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475437","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:33Z","timestamp":1750193313000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475437"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":44,"alternative-id":["10.1145\/3474085.3475437","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475437","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}