{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:18:18Z","timestamp":1775067498780,"version":"3.50.1"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2022,8,8]],"date-time":"2022-08-08T00:00:00Z","timestamp":1659916800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,8,8]],"date-time":"2022-08-08T00:00:00Z","timestamp":1659916800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2023,1]]},"DOI":"10.1007\/s00034-022-02122-3","type":"journal-article","created":{"date-parts":[[2022,8,8]],"date-time":"2022-08-08T12:03:21Z","timestamp":1659960201000},"page":"307-321","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Multi-Voice Singing Synthesis From Lyrics"],"prefix":"10.1007","volume":"42","author":[{"given":"S.","family":"Resna","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5488-9026","authenticated-orcid":false,"given":"Rajeev","family":"Rajan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,8,8]]},"reference":[{"key":"2122_CR1","unstructured":"M. Arjovsky, S. Chintala, L. Bottou, Wasserstein generative adversarial networks. in Proceedings of the 34th International Conference on Machine Learning, vol. 70, pp. 214-223 (2017)"},{"key":"2122_CR2","doi-asserted-by":"publisher","unstructured":"M. Blaauw, J. Bonada, Sequence-to-sequence singing synthesis using the feed-forward transformer. 
in ICASSP 2020- 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020) pp. 7229-7233. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053944","DOI":"10.1109\/ICASSP40776.2020.9053944"},{"issue":"2","key":"2122_CR3","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1109\/TASSP.1979.1163209","volume":"27","author":"S Boll","year":"1979","unstructured":"S. Boll, Suppression of acoustic noise in speech using spectral subtraction. IEEE Trans. Acoust. Speech Signal Process. 27(2), 113\u2013120 (1979). https:\/\/doi.org\/10.1109\/TASSP.1979.1163209","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"2122_CR4","unstructured":"E. Casanova, J. Weber, C. Shulby, A. C. Junior, E. G\u00f6lge, M. A. Ponti, YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone. arXiv preprint arXiv:2112.02418 (2021)"},{"key":"2122_CR5","doi-asserted-by":"crossref","unstructured":"P. Chandna, M. Blaauw, J. Bonada, E. G\u00f3mez, WGANSing: a multi-voice singing voice synthesizer based on the Wasserstein-GAN. in Proceedings of 27th European Signal Processing Conference, pp. 1\u20135 (2019)","DOI":"10.23919\/EUSIPCO.2019.8903099"},{"key":"2122_CR6","unstructured":"J. Chen, X. Tan, J. Luan, T. Qin, T.-Y. Liu, HiFiSinger: Towards high-fidelity neural singing voice synthesis. arXiv preprint arXiv:2009.01776 (2020)"},{"key":"2122_CR7","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1109\/AIVR52153.2021.00067","volume":"2021","author":"Y-P Cho","year":"2021","unstructured":"Y.-P. Cho, F.-R. Yang, Y.-C. Chang, C.-T. Cheng, X.-H. Wang, Y.-W. Liu, A survey on recent deep learning-driven singing voice synthesis systems. IEEE International Conference on Artificial Intelligence and Virtual Reality (AIVR) 2021, 319\u2013323 (2021). 
https:\/\/doi.org\/10.1109\/AIVR52153.2021.00067","journal-title":"IEEE International Conference on Artificial Intelligence and Virtual Reality (AIVR)"},{"key":"2122_CR8","doi-asserted-by":"publisher","unstructured":"S. Choi, W. Kim, S. Park, S. Yong, J. Nam, Korean singing voice synthesis based on auto-regressive boundary equilibrium Gan. in ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020) pp. 7234\u20137238. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053950","DOI":"10.1109\/ICASSP40776.2020.9053950"},{"key":"2122_CR9","doi-asserted-by":"publisher","first-page":"17","DOI":"10.5120\/ijca2017915612","volume":"175","author":"B Choksi","year":"2017","unstructured":"B. Choksi, A. Sawant, S. Mali, Style transfer for audio using convolutional neural networks. Int. J. Comput. Appl. 175, 17\u201320 (2017). https:\/\/doi.org\/10.5120\/ijca2017915612","journal-title":"Int. J. Comput. Appl."},{"key":"2122_CR10","doi-asserted-by":"publisher","unstructured":"Z. Duan, H. Fang, B. Li, K. C. Sim, Y. Wang, The NUS sung and spoken lyrics corpus: A quantitative comparison of singing and speech. in Proceedings of Asia-Pacific Signal and Information Processing Association Annual Summit and Conference, pp. 1\u20139 (2013) https:\/\/doi.org\/10.1109\/APSIPA.2013.6694316","DOI":"10.1109\/APSIPA.2013.6694316"},{"key":"2122_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s13636-019-0163-y","volume":"2019","author":"M Freixes","year":"2019","unstructured":"M. Freixes, F. Al\u00edas, J.C. Carri\u00e9, A unit selection text-to-speech-and-singing synthesis framework from neutral speech: proof of concept. EURASIP J. Audio Speech Music Process. 2019, 1\u201314 (2019)","journal-title":"EURASIP J. Audio Speech Music Process."},{"key":"2122_CR12","doi-asserted-by":"crossref","unstructured":"L. Gatys, A. Ecker, M. Bethge, Image style transfer using convolutional neural networks. 
in Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2414\u20132423 (2016)","DOI":"10.1109\/CVPR.2016.265"},{"key":"2122_CR13","doi-asserted-by":"publisher","unstructured":"D. Griffin, Jae Lim, Signal estimation from modified short-time Fourier transform. in Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing, vol. 8, pp. 804\u2013807 (1983). https:\/\/doi.org\/10.1109\/ICASSP.1983.1172092","DOI":"10.1109\/ICASSP.1983.1172092"},{"key":"2122_CR14","doi-asserted-by":"publisher","unstructured":"Y. Gu et al., ByteSing: A Chinese singing voice synthesis system using duration allocated encoder-decoder acoustic models and WaveRNN vocoders. in 2021 12th International Symposium on Chinese Spoken Language Processing (ISCSLP) (2021) pp. 1\u20135. https:\/\/doi.org\/10.1109\/ISCSLP49672.2021.9362104","DOI":"10.1109\/ISCSLP49672.2021.9362104"},{"key":"2122_CR15","unstructured":"C. Gupta, R. Tong, H. Li, Y. Wang, Semi-supervised Lyrics and Solo-singing alignment. in Proceedings of International Society for Music Information Retrieval Conference (ISMIR), pp. 600\u2013607 (2018)"},{"key":"2122_CR16","doi-asserted-by":"publisher","first-page":"2803","DOI":"10.1109\/TASLP.2021.3104165","volume":"29","author":"Y Hono","year":"2021","unstructured":"Y. Hono, K. Hashimoto, K. Oura, Y. Nankaku, K. Tokuda, Sinsy: a deep neural network-based singing voice synthesis system. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 2803\u20132815 (2021). https:\/\/doi.org\/10.1109\/TASLP.2021.3104165","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"2122_CR17","doi-asserted-by":"crossref","unstructured":"Y. Hono, K. Hashimoto, K. Oura, Y. Nankaku, K. Tokuda, Singing voice synthesis based on generative adversarial networks. in Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 
6955\u20136959 (2019)","DOI":"10.1109\/ICASSP.2019.8683154"},{"key":"2122_CR18","unstructured":"Y. Jia et al., Transfer learning from speaker verification to multispeaker text-to-speech synthesis. Adv. Neural Inf. Process. Syst. 31 (2018)"},{"key":"2122_CR19","doi-asserted-by":"crossref","unstructured":"J. Kim, H. Choi, J. Park, S. Kim, J. Kim, M. Hahn, Korean singing voice synthesis system based on an LSTM recurrent neural network. in Proceedings of Interspeech, pp. 1551\u20131555 (2018)","DOI":"10.21437\/Interspeech.2018-1575"},{"key":"2122_CR20","doi-asserted-by":"publisher","unstructured":"J. Lee, H.-S. Choi, J. Koo, K. Lee, Disentangling timbre and singing style with multi-singer singing synthesis system. ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020) pp. 7224\u20137228. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054636","DOI":"10.1109\/ICASSP40776.2020.9054636"},{"key":"2122_CR21","doi-asserted-by":"crossref","unstructured":"J. Li, H. Yang, W. Zhang, L. Cai, A lyrics to singing voice synthesis system with variable timbre. communications in computer and information science, pp. 186\u2013193 (2011)","DOI":"10.1007\/978-3-642-23220-6_23"},{"key":"2122_CR22","doi-asserted-by":"crossref","unstructured":"J. Liu, C. Li, Y. Ren, F. Chen, P. Liu, Z. Zhao, Diffsinger: Singing voice synthesis via shallow diffusion mechanism. arXiv preprint arXiv:2105.02446 (2021)","DOI":"10.1609\/aaai.v36i10.21350"},{"key":"2122_CR23","doi-asserted-by":"publisher","first-page":"773","DOI":"10.1109\/ASRU51503.2021.9688029","volume":"2021","author":"R Liu","year":"2021","unstructured":"R. Liu, X. Wen, C. Lu, L. Song, J.S. Sung, Vibrato learning in multi-singer singing voice synthesis. IEEE Autom. Speech Recognit. Underst. Workshop (ASRU) 2021, 773\u2013779 (2021). https:\/\/doi.org\/10.1109\/ASRU51503.2021.9688029","journal-title":"IEEE Autom. Speech Recognit. Underst. 
Workshop (ASRU)"},{"key":"2122_CR24","doi-asserted-by":"crossref","unstructured":"B. McFee et al., Librosa: audio and music signal analysis in python. in Proceedings of 14th Python in Science Conference, pp. 18\u201324 (2015)","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"2122_CR25","unstructured":"P. K. Mital, Time domain neural audio style transfer. CoRR (2017). arxiv:1711.11160"},{"key":"2122_CR26","doi-asserted-by":"publisher","first-page":"1010","DOI":"10.1016\/j.csl.2019.101027","volume":"60","author":"A Nagrani","year":"2020","unstructured":"A. Nagrani, J.S. Chung, W. Xie, A. Zisserman, Voxceleb: large-scale speaker verification in the wild. Comput. Speech Lang. 60, 1010\u201327 (2020)","journal-title":"Comput. Speech Lang."},{"key":"2122_CR27","doi-asserted-by":"crossref","unstructured":"A. Nagrani, J. S. Chung, A. Zisserman, VoxCeleb: A large-scale speaker identification dataset. in Proceedings of Interspeech, pp. 2616\u20132620 (2017)","DOI":"10.21437\/Interspeech.2017-950"},{"key":"2122_CR28","doi-asserted-by":"publisher","unstructured":"K. Nakamura, S. Takaki, K. Hashimoto, K. Oura, Y. Nankaku, K. Tokuda, Fast and high-quality singing voice synthesis system based on convolutional neural networks. in ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020) pp. 7239\u20137243. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053811","DOI":"10.1109\/ICASSP40776.2020.9053811"},{"key":"2122_CR29","doi-asserted-by":"crossref","unstructured":"M. Nishimura, K. Hashimoto, K. Oura, Y. Nankaku, K. Tokuda, Singing voice synthesis based on deep neural networks. in Proceedings of Interspeech, pp. 2478\u20132482 (2016)","DOI":"10.21437\/Interspeech.2016-1027"},{"key":"2122_CR30","doi-asserted-by":"publisher","unstructured":"V. Panayotov, G. Chen, D. Povey, S. Khudanpur, Librispeech: An ASR corpus based on public domain audio books. 
in Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 5206\u20135210 (2015). https:\/\/doi.org\/10.1109\/ICASSP.2015.7178964","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"2122_CR31","doi-asserted-by":"crossref","unstructured":"J. Parekh, P. Rao, Y. H. Yang, Speech-to-singing conversion in an encoder-decoder framework. in Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 261\u2013265 (2020)","DOI":"10.1109\/ICASSP40776.2020.9054473"},{"key":"2122_CR32","doi-asserted-by":"crossref","unstructured":"Y. Ren, X. Tan, T. Qin, J. Luan, Z. Zhao, T.-Y. Liu, DeepSinger: singing voice synthesis with data mined from the web. in Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1979\u20131989 (2020)","DOI":"10.1145\/3394486.3403249"},{"key":"2122_CR33","first-page":"234","volume":"2015","author":"O Ronneberger","year":"2015","unstructured":"O. Ronneberger, P. Fischer, T. Brox, U-Net: convolutional networks for biomedical image segmentation. Medical Image Comput. Comput. Assist. Intervent. MICCAI 2015, 234\u2013241 (2015)","journal-title":"Medical Image Comput. Comput. Assist. Intervent. MICCAI"},{"key":"2122_CR34","doi-asserted-by":"publisher","first-page":"103","DOI":"10.1016\/j.specom.2021.12.005","volume":"137","author":"A Saeed","year":"2022","unstructured":"A. Saeed, M.F. Hayat, T. Habib, D.A. Ghaffar, M.A. Qureshi, A novel multi-speakers Urdu singing voices synthesizer using Wasserstein generative adversarial network. Speech Commun. 137, 103\u2013113 (2022). https:\/\/doi.org\/10.1016\/j.specom.2021.12.005","journal-title":"Speech Commun."},{"key":"2122_CR35","doi-asserted-by":"crossref","unstructured":"J. Shen et al. Natural TTS synthesis by conditioning WaveNet on mel spectrogram predictions. CoRR (2017). arxiv:1712.05884","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"2122_CR36","doi-asserted-by":"publisher","unstructured":"J. 
Shi, S. Guo, N. Huo, Y. Zhang, Q. Jin, Sequence-to-sequence singing voice synthesis with perceptual entropy loss. in ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2021) pp. 76\u201380. https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9414348","DOI":"10.1109\/ICASSP39728.2021.9414348"},{"key":"2122_CR37","doi-asserted-by":"crossref","unstructured":"L. Su, Vocal melody extraction using patch-based CNN. in Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 371\u2013375 (2018)","DOI":"10.1109\/ICASSP.2018.8462420"},{"key":"2122_CR38","unstructured":"D. Ulyanov, V. Lebedev, Audio texture synthesis and style transfer. (2016). http:\/\/tinyurl.com\/y844x8qt"},{"key":"2122_CR39","doi-asserted-by":"publisher","unstructured":"R. Valle, J. Li, R. Prenger, B. Catanzaro, Mellotron: multispeaker expressive voice synthesis by conditioning on rhythm, pitch and global style tokens. in ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020) pp. 6189\u20136193. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9054556","DOI":"10.1109\/ICASSP40776.2020.9054556"},{"key":"2122_CR40","unstructured":"D.-Y. Wu, Y.-H. Yang, Speech-to-singing conversion based on boundary equilibrium gan. 
arXiv preprint arXiv:2005.13835 (2020)"}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-022-02122-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00034-022-02122-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-022-02122-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,4]],"date-time":"2023-01-04T05:19:24Z","timestamp":1672809564000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00034-022-02122-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,8]]},"references-count":40,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,1]]}},"alternative-id":["2122"],"URL":"https:\/\/doi.org\/10.1007\/s00034-022-02122-3","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"value":"0278-081X","type":"print"},{"value":"1531-5878","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,8,8]]},"assertion":[{"value":"16 January 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 July 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 July 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 August 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors 
declare that there is no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}