{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:06:01Z","timestamp":1761897961073,"version":"3.28.0"},"reference-count":35,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,16]]},"DOI":"10.1109\/asru57964.2023.10389702","type":"proceedings-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:38:40Z","timestamp":1705689520000},"page":"1-6","source":"Crossref","is-referenced-by-count":4,"title":["Crosssinger: A Cross-Lingual Multi-Singer High-Fidelity Singing Voice Synthesizer Trained on Monolingual Singers"],"prefix":"10.1109","author":[{"given":"Xintong","family":"Wang","sequence":"first","affiliation":[{"name":"Beijing Bombax XiaoIce Technology Co., Ltd,China"}]},{"given":"Chang","family":"Zeng","sequence":"additional","affiliation":[{"name":"National Institute of Informatics,Japan"}]},{"given":"Jun","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University,Shenzhen International Graduate School,Shenzhen,China"}]},{"given":"Chunhui","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing Bombax XiaoIce Technology Co., Ltd,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953090"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2872060"},{"key":"ref4","article-title":"Fastspeech: Fast, robust and control-lable text to speech","volume":"32","author":"Ren","year":"2019"},{"article-title":"FastSpeech 2: Fast and High-Quality End-to-End Text to Speech","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Ren","key":"ref5"},{"key":"ref6","first-page":"125","article-title":"WaveNet: A Generative Model for Raw Audio","volume-title":"The 9th ISCA Speech Synthesis Workshop","author":"van den Oord"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2956145"},{"key":"ref8","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683154"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1722"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1410"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-119"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547854"},{"key":"ref14","article-title":"Cross-lingual multi-speaker text-to-speech under limited-data scenario","author":"Cai","year":"2020","journal-title":"arXiv preprint arXiv:2005.10441"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1632"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2668"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054722"},{"article-title":"AdaSpeech: Adaptive Text to Speech for Custom Voice","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Chen","key":"ref18"},{"issue":"1","key":"ref19","first-page":"2096","article-title":"Domain-adversarial training of neural networks","volume":"17","author":"Ganin","year":"2016","journal-title":"The journal of machine learning research"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1250\/ast.42.140"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2013.6694316"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"ref23","article-title":"Hifisinger: Towards high-fidelity neural singing voice synthesis","author":"Chen","year":"2020","journal-title":"arXiv preprint arXiv:2009.01776"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053944"},{"key":"ref25","article-title":"Melgan: Generative adversarial networks for conditional waveform synthesis","volume":"32","author":"Kumar","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.304"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.19"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_43"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2668"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-172"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682674"},{"article-title":"Adam: A method for stochastic optimization","volume-title":"3rd International Conference on Learning Representations, ICLR 2015","author":"Kingma","key":"ref33"},{"key":"ref34","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref35","article-title":"HiFi-WaveGAN: Generative Adversarial Network with Auxiliary Spectrogram-Phase Loss for High-Fidelity Singing Voice Generation","author":"Wang","year":"2022","journal-title":"arXiv preprint arXiv:2210.12740"}],"event":{"name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2023,12,16]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2023,12,20]]}},"container-title":["2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10388490\/10389614\/10389702.pdf?arnumber=10389702","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,23]],"date-time":"2024-01-23T16:39:14Z","timestamp":1706027954000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10389702\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,16]]},"references-count":35,"URL":"https:\/\/doi.org\/10.1109\/asru57964.2023.10389702","relation":{},"subject":[],"published":{"date-parts":[[2023,12,16]]}}}