{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:23:33Z","timestamp":1760315013976,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032079589","type":"print"},{"value":"9783032079596","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:00:00Z","timestamp":1760313600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07959-6_14","type":"book-chapter","created":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:28Z","timestamp":1760260948000},"page":"189-202","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Adaptive Singing Voice Enhancement for\u00a0Live Stages"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4818-3198","authenticated-orcid":false,"given":"Jia-Lien","family":"Hsu","sequence":"first","affiliation":[]},{"given":"Pei-Wen","family":"Chien","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,10,13]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Blaauw, M., Bonada, J.: A neural parametric singing synthesizer modeling timbre and expression from natural songs. Appl. Sci. 7(12) (2017), https:\/\/www.mdpi.com\/2076-3417\/7\/12\/1313","DOI":"10.3390\/app7121313"},{"key":"14_CR2","unstructured":"Casanova, E., Weber, J., Shulby, C.D., Junior, A.C., G\u00f6lge, E., Ponti, M.A.: YourTTS: towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone. In: Chaudhuri, K., Jegelka, S., Song, L., Szepesvari, C., Niu, G., Sabato, S. (eds.) Proceedings of the 39th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0162, pp. 2709\u20132720. PMLR (2022). https:\/\/proceedings.mlr.press\/v162\/casanova22a.html"},{"key":"14_CR3","doi-asserted-by":"publisher","unstructured":"Choi, S., Han, S., Kim, D., Ha, S.: Attentron: few-shot text-to-speech utilizing attention-based variable-length embedding. In: Interspeech 2020, pp. 2007\u20132011 (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-2096","DOI":"10.21437\/Interspeech.2020-2096"},{"key":"14_CR4","doi-asserted-by":"publisher","unstructured":"Elias, I., et al.: Parallel tacotron 2: a non-autoregressive neural tts model with differentiable duration modeling. In: Interspeech 2021, pp. 141\u2013145 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-1461","DOI":"10.21437\/Interspeech.2021-1461"},{"key":"14_CR5","doi-asserted-by":"publisher","first-page":"4036","DOI":"10.1109\/TASLP.2024.3451951","volume":"32","author":"C Gong","year":"2024","unstructured":"Gong, C., et al.: ZMM-TTS: Zero-shot multilingual and multispeaker speech synthesis conditioned on self-supervised discrete speech representations. IEEE\/ACM Trans. Audio Speech Lang. Process. 32, 4036\u20134051 (2024). https:\/\/doi.org\/10.1109\/TASLP.2024.3451951","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"14_CR6","doi-asserted-by":"publisher","unstructured":"Gu, Y., et al.: Bytesing: a Chinese singing voice synthesis system using duration allocated encoder-decoder acoustic models and wavernn vocoders. In: 2021 12th International Symposium on Chinese Spoken Language Processing (ISCSLP), pp.\u00a01\u20135 (2021). https:\/\/doi.org\/10.1109\/ISCSLP49672.2021.9362104","DOI":"10.1109\/ISCSLP49672.2021.9362104"},{"key":"14_CR7","doi-asserted-by":"publisher","unstructured":"He, J., et al.: RMSSinger: realistic-music-score based singing voice synthesis. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Findings of the Association for Computational Linguistics: ACL 2023, pp. 236\u2013248. Association for Computational Linguistics, Toronto (2023). https:\/\/doi.org\/10.18653\/v1\/2023.findings-acl.16, https:\/\/aclanthology.org\/2023.findings-acl.16\/","DOI":"10.18653\/v1\/2023.findings-acl.16"},{"key":"14_CR8","doi-asserted-by":"publisher","unstructured":"Huang, Q., Jansen, A., Lee, J., Ganti, R., Li, J.Y., Ellis, D.P.W.: Mulan: a joint embedding of music audio and natural language. In: Proceedings of the 23rd International Society for Music Information Retrieval Conference, pp. 559\u2013566. ISMIR (2022). https:\/\/doi.org\/10.5281\/zenodo.7316724","DOI":"10.5281\/zenodo.7316724"},{"key":"14_CR9","doi-asserted-by":"publisher","unstructured":"Huang, S.F., Lin, C.J., Liu, D.R., Chen, Y.C., Lee, H.Y.: Meta-TTS: meta-learning for few-shot speaker adaptive text-to-speech. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 1558\u20131571 (2022). https:\/\/doi.org\/10.1109\/TASLP.2022.3167258","DOI":"10.1109\/TASLP.2022.3167258"},{"key":"14_CR10","doi-asserted-by":"publisher","first-page":"1519","DOI":"10.1109\/TASLP.2024.3364085","volume":"32","author":"M Jeong","year":"2024","unstructured":"Jeong, M., Kim, M., Choi, B.J., Yoon, J., Jang, W., Kim, N.S.: Transfer learning for low-resource, multi-lingual, and zero-shot multi-speaker text-to-speech. IEEE\/ACM Trans. Audio Speech Lang. Process. 32, 1519\u20131530 (2024). https:\/\/doi.org\/10.1109\/TASLP.2024.3364085","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"14_CR11","doi-asserted-by":"publisher","unstructured":"Li, S., et al.: A survey on cross-modal interaction between music and multimodal data. arXiv e-prints arXiv:2504.12796 (2025). https:\/\/doi.org\/10.48550\/arXiv.2504.12796","DOI":"10.48550\/arXiv.2504.12796"},{"issue":"2","key":"14_CR12","doi-asserted-by":"publisher","first-page":"192","DOI":"10.1109\/JAS.2016.7451107","volume":"3","author":"X Li","year":"2016","unstructured":"Li, X., Wang, Z.: A hmm-based mandarin Chinese singing voice synthesis system. IEEE\/CAA J. Autom. Sinica 3(2), 192\u2013202 (2016). https:\/\/doi.org\/10.1109\/JAS.2016.7451107","journal-title":"IEEE\/CAA J. Autom. Sinica"},{"key":"14_CR13","doi-asserted-by":"publisher","unstructured":"Lin, Y.B., Sung, Y.L., Lei, J., Bansal, M., Bertasius, G.: Vision transformers are parameter-efficient audio-visual learners. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2299\u20132309 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00228","DOI":"10.1109\/CVPR52729.2023.00228"},{"key":"14_CR14","doi-asserted-by":"publisher","unstructured":"Liu, J., Li, C., Ren, Y., Chen, F., Zhao, Z.: DiffSinger: singing voice synthesis via shallow diffusion mechanism. In: Proceedings of the AAAI Conference on Artificial Intelligence, no. 10, pp. 11020\u201311028 (2022). https:\/\/doi.org\/10.1609\/aaai.v36i10.21350, https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/21350","DOI":"10.1609\/aaai.v36i10.21350"},{"key":"14_CR15","doi-asserted-by":"publisher","unstructured":"Lu, P., Wu, J., Luan, J., Tan, X., Zhou, L.: XiaoiceSing: a high-quality and integrated singing voice synthesis system. In: Interspeech 2020, pp. 1306\u20131310 (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-1410","DOI":"10.21437\/Interspeech.2020-1410"},{"key":"14_CR16","doi-asserted-by":"publisher","unstructured":"Peng, P., Huang, P.Y., Li, S.W., Mohamed, A., Harwath, D.: VoiceCraft: zero-shot speech editing and text-to-speech in the wild. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 12442\u201312462. Association for Computational Linguistics, Bangkok (2024). https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.673, https:\/\/aclanthology.org\/2024.acl-long.673\/","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"14_CR17","unstructured":"Ren, Y., Hu, C., Tan, X., Qin, T., Zhao, S., Zhao, Z., Liu, T.Y.: FastSpeech 2: fast and high-quality end-to-end text to speech. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=piLPYqxtWuA"},{"key":"14_CR18","unstructured":"Ren, Y., et al.: FastSpeech: Fast, Robust and Controllable Text to Speech. Curran Associates Inc., Red Hook (2019)"},{"key":"14_CR19","unstructured":"Shen, K., et al.: Naturalspeech 2: latent diffusion models are natural and zero-shot speech and singing synthesizers. In: International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=Rc7dAwVL3v"},{"key":"14_CR20","doi-asserted-by":"publisher","unstructured":"Sun, Z., Sarma, P., Sethares, W., Liang, Y.: Learning relationships between text, audio, and video via deep canonical correlation for multimodal language analysis. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, no. 05, pp. 8992\u20138999 (2020). https:\/\/doi.org\/10.1609\/aaai.v34i05.6431, https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/6431","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"14_CR21","doi-asserted-by":"publisher","unstructured":"Tan, X., Qin, T., Soong, F., Liu, T.Y.: A survey on neural speech synthesis. arXiv e-prints arXiv:2106.15561 (2021). https:\/\/doi.org\/10.48550\/arXiv.2106.15561","DOI":"10.48550\/arXiv.2106.15561"},{"key":"14_CR22","unstructured":"van den Oord, A., et al.: WaveNet: a generative model for raw audio. In: 9th ISCA Workshop on Speech Synthesis Workshop (SSW 9), p.\u00a0125 (2016)"},{"key":"14_CR23","doi-asserted-by":"publisher","unstructured":"Wang, J.Y., Lee, H.Y., Jang, J.S.R., Su, L.: Zero-shot singing voice synthesis from musical score. In: 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp.\u00a01\u20138 (2023). https:\/\/doi.org\/10.1109\/ASRU57964.2023.10389711","DOI":"10.1109\/ASRU57964.2023.10389711"},{"key":"14_CR24","doi-asserted-by":"publisher","unstructured":"Wang, W., Song, Y., Jha, S.: USAT: a universal speaker-adaptive text-to-speech approach. IEEE\/ACM Trans. Audio, Speech and Lang. Process. 32, 2590\u20132604 (2024). https:\/\/doi.org\/10.1109\/TASLP.2024.3393714","DOI":"10.1109\/TASLP.2024.3393714"},{"key":"14_CR25","doi-asserted-by":"publisher","unstructured":"Wang, Y., et al.: Tacotron: towards end-to-end speech synthesis. In: Interspeech 2017, pp. 4006\u20134010 (2017). https:\/\/doi.org\/10.21437\/Interspeech.2017-1452","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"14_CR26","doi-asserted-by":"publisher","unstructured":"Yu, Y., Tang, S., Raposo, F., Chen, L.: Deep cross-modal correlation learning for audio and lyrics in music retrieval. ACM Trans. Multimedia Comput. Commun. Appl. 15(1) (2019). https:\/\/doi.org\/10.1145\/3281746","DOI":"10.1145\/3281746"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07959-6_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T09:22:31Z","timestamp":1760260951000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07959-6_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,13]]},"ISBN":["9783032079589","9783032079596"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07959-6_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10,13]]},"assertion":[{"value":"13 October 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that\u00a0are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Szeged","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hungary","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom.inf.u-szeged.hu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}