{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T03:05:12Z","timestamp":1742958312398,"version":"3.40.3"},"publisher-location":"Cham","reference-count":29,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031781711"},{"type":"electronic","value":"9783031781728"}],"license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78172-8_12","type":"book-chapter","created":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T09:47:54Z","timestamp":1733132874000},"page":"177-189","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["TMCSpeech: A Chinese TV and Movie Speech Dataset with Character Descriptions and a Character-Based Voice Generation Model"],"prefix":"10.1007","author":[{"given":"Dong","family":"Liu","sequence":"first","affiliation":[]},{"given":"Yueqian","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Yunfei","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Ming","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"12_CR1","unstructured":"Arik, S., Chen, J., Peng, K., Ping, W., Zhou, Y.: Neural voice cloning with a few samples. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Bilinski, P., et al.: Creating new voices using normalizing flows. arXiv preprint arXiv:2312.14569 (2023)","DOI":"10.21437\/Interspeech.2022-10195"},{"key":"12_CR3","unstructured":"Casanova, E., Weber, J., Shulby, C.D., Junior, A.C., G\u00f6lge, E., Ponti, M.A.: YourTTS: towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone. In: International Conference on Machine Learning, pp. 2709\u20132720. PMLR (2022)"},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Chung, J.S., Nagrani, A., Zisserman, A.: VoxCeleb2: deep speaker recognition. arXiv preprint arXiv:1806.05622 (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Ververas, E., Kotsia, I., Zafeiriou, S.: RetinaFace: single-shot multi-level face localisation in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5203\u20135212 (2020)","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: ArcFace: additive angular margin loss for deep face recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4690\u20134699 (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"12_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Duta, I.C., Liu, L., Zhu, F., Shao, L.: Improved residual networks for image and video recognition. In: 2020 25th International Conference on Pattern Recognition, pp. 9415\u20139422. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9412193"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Gao, Z., Zhang, S., McLoughlin, I., Yan, Z.: Paraformer: fast and accurate parallel transformer for non-autoregressive end-to-end speech recognition. arXiv preprint arXiv:2206.08317 (2022)","DOI":"10.21437\/Interspeech.2022-9996"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Guo, Z., Leng, Y., Wu, Y., Zhao, S., Tan, X.: PromptTTS: controllable text-to-speech with text descriptions. In: IEEE ICASSP, pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096285"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"12_CR12","doi-asserted-by":"publisher","unstructured":"Hennequin, R., Khlif, A., Voituret, F., Moussallam, M.: Spleeter: a fast and efficient music source separation tool with pre-trained models. J. Open Source Softw. 5(50), 2154 (2020). https:\/\/doi.org\/10.21105\/joss.02154. Deezer Research","DOI":"10.21105\/joss.02154"},{"key":"12_CR13","unstructured":"Huang, R., et al.: Make-an-audio: text-to-audio generation with prompt-enhanced diffusion models. arXiv preprint arXiv:2301.12661 (2023)"},{"key":"12_CR14","unstructured":"Jia, Y., et\u00a0al.: Transfer learning from speaker verification to multispeaker text-to-speech synthesis. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"12_CR15","unstructured":"Kim, J., Kong, J., Son, J.: Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech. In: International Conference on Machine Learning. pp. 5530\u20135540. PMLR (2021)"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Kim, M., Cheon, S.J., Choi, B.J., Kim, J.J., Kim, N.S.: Expressive text-to-speech using style tag. arXiv preprint arXiv:2104.00436 (2021)","DOI":"10.21437\/Interspeech.2021-465"},{"key":"12_CR17","unstructured":"Kingma, D.P., Dhariwal, P.: Glow: generative flow with invertible 1x1 convolutions. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Liu, G., et al.: PromptStyle: controllable style transfer for text-to-speech with natural language descriptions. arXiv preprint arXiv:2305.19522 (2023)","DOI":"10.21437\/Interspeech.2023-1779"},{"key":"12_CR19","unstructured":"Liu, H., et al.: AudioLDM: text-to-audio generation with latent diffusion models. arXiv preprint arXiv:2301.12503 (2023)"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Okabe, K., Koshinaka, T., Shinoda, K.: Attentive statistics pooling for deep speaker embedding. arXiv preprint arXiv:1803.10963 (2018)","DOI":"10.21437\/Interspeech.2018-993"},{"issue":"1","key":"12_CR21","first-page":"2617","volume":"22","author":"G Papamakarios","year":"2021","unstructured":"Papamakarios, G., Nalisnick, E., Rezende, D.J., Mohamed, S., Lakshminarayanan, B.: Normalizing flows for probabilistic modeling and inference. J. Mach. Learn. Res. 22(1), 2617\u20132680 (2021)","journal-title":"J. Mach. Learn. Res."},{"key":"12_CR22","unstructured":"Qin, X., et al.: The DKU-Tencent system for the VoxCeleb speaker recognition challenge (2022). arXiv preprint arXiv:2210.05092"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Shi, Y., Bu, H., Xu, X., Zhang, S., Li, M.: AISHELL-3: a multi-speaker mandarin TTS corpus and the baselines. arXiv preprint arXiv:2010.11567 (2020)","DOI":"10.21437\/Interspeech.2021-755"},{"key":"12_CR24","unstructured":"Shi, Y., Li, M.: VoiceLens: controllable speaker generation and editing with flow. arXiv preprint arXiv:2309.14094 (2023)"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., Khudanpur, S.: X-vectors: robust DNN embeddings for speaker recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5329\u20135333. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Stanton, D., et al.: Speaker generation. In: IEEE ICASSP, pp. 7897\u20137901. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747345"},{"key":"12_CR27","unstructured":"Tan, X., Qin, T., Soong, F., Liu, T.Y.: A survey on neural speech synthesis. arXiv preprint arXiv:2106.15561 (2021)"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Yang, D., et al.: InstrucTTTS: modelling expressive TTS in discrete latent space with natural language style prompt. arXiv preprint arXiv:2301.13662 (2023)","DOI":"10.1109\/TASLP.2024.3402088"},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: PromptSpeaker: speaker generation based on text descriptions. arXiv preprint arXiv:2310.05001 (2023)","DOI":"10.1109\/ASRU57964.2023.10389772"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78172-8_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T10:05:46Z","timestamp":1733133946000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78172-8_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"ISBN":["9783031781711","9783031781728"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78172-8_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"3 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}