{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T06:46:48Z","timestamp":1764226008263,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819772315"},{"type":"electronic","value":"9789819772322"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-7232-2_7","type":"book-chapter","created":{"date-parts":[[2024,8,27]],"date-time":"2024-08-27T16:02:47Z","timestamp":1724774567000},"page":"90-104","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["RSET: Remapping-Based Sorting Method for\u00a0Emotion Transfer Speech Synthesis"],"prefix":"10.1007","author":[{"given":"Haoxiang","family":"Shi","sequence":"first","affiliation":[]},{"given":"Jianzong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xulong","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Ning","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Jun","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Xiao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,28]]},"reference":[{"key":"7_CR1","unstructured":"Cheng, P., Hao, W., Dai, S., Liu, J., 
Gan, Z., Carin, L.: Club: a contrastive log-ratio upper bound of mutual information. In: Proceedings of the 37th International Conference on Machine Learning, pp. 1779\u20131788 (2020)"},{"key":"7_CR2","doi-asserted-by":"crossref","unstructured":"Eyben, F., Weninger, F., Gross, F., Schuller, B.: Recent developments in openSMILE, the Munich open-source multimedia feature extractor. In: Proceedings of the 21st ACM International Conference on Multimedia, pp. 835\u2013838 (2013)","DOI":"10.1145\/2502081.2502224"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Guo, Y., Du, C., Chen, X., Yu, K.: EmoDiff: intensity controllable emotional text-to-speech with soft-label guidance. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10095621"},{"key":"7_CR4","doi-asserted-by":"crossref","unstructured":"Im, C., Lee, S., Kim, S., Lee, S.: EMOQ-TTS: emotion intensity quantization for fine-grained controllable emotional text-to-speech. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 6317\u20136321 (2022)","DOI":"10.1109\/ICASSP43922.2022.9747098"},{"key":"7_CR5","doi-asserted-by":"crossref","unstructured":"Inoue, S., Zhou, K., Wang, S., Li, H.: Hierarchical emotion prediction and control in text-to-speech synthesis. In: 2024 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 10601\u201310605 (2024)","DOI":"10.1109\/ICASSP48485.2024.10445996"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Joachims, T.: Optimizing search engines using clickthrough data. In: Proceedings of the eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 133\u2013142 (2002)","DOI":"10.1145\/775047.775067"},{"key":"7_CR7","unstructured":"Kominek, J., Schultz, T., Black, A.W.: Synthesizer voice quality of new languages calibrated with mean mel cepstral distortion. 
In: First International Workshop on Spoken Languages Technologies for Under-Resourced Languages, pp. 63\u201368 (2008)"},{"key":"7_CR8","unstructured":"Kong, J., Kim, J., Bae, J.: HiFi-GAN: generative adversarial networks for efficient and high fidelity speech synthesis. In: Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, vol. 33, pp. 17022\u201317033 (2020)"},{"key":"7_CR9","unstructured":"Lee, Y., Rabiee, A., Lee, S.Y.: Emotional end-to-end neural speech synthesizer. arXiv preprint arXiv:1711.05447 (2017)"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Lei, Y., Yang, S., Xie, L.: Fine-grained emotion strength transfer, control and prediction for emotional speech synthesis. In: IEEE Spoken Language Technology Workshop, pp. 423\u2013430 (2021)","DOI":"10.1109\/SLT48900.2021.9383524"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Li, T., et al.: DiCLET-TTS: diffusion model based cross-lingual emotion transfer for text-to-speech - a study between English and mandarin. IEEE ACM Trans. Audio Speech Lang. Process. 31, 3418\u20133430 (2023)","DOI":"10.1109\/TASLP.2023.3313413"},{"key":"7_CR12","doi-asserted-by":"crossref","unstructured":"Li, T., Yang, S., Xue, L., Xie, L.: Controllable emotion transfer for end-to-end speech synthesis. In: 12th International Symposium on Chinese Spoken Language Processing, pp.\u00a01\u20135 (2021)","DOI":"10.1109\/ISCSLP49672.2021.9362069"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Matsumoto, K., Hara, S., Abe, M.: Controlling the strength of emotions in speech-like emotional sound generated by waveNet. In: 21st Annual Conference of the International Speech Communication Association, pp. 3421\u20133425 (2020)","DOI":"10.21437\/Interspeech.2020-2064"},{"key":"7_CR14","unstructured":"Min, D., Lee, D.B., Yang, E., Hwang, S.J.: Meta-stylespeech: multi-speaker adaptive text-to-speech generation. 
In: Proceedings of the 38th International Conference on Machine Learning, pp. 7748\u20137759 (2021)"},{"key":"7_CR15","doi-asserted-by":"crossref","unstructured":"Parikh, D., Grauman, K.: Relative attributes. In: 2011 International Conference on Computer Vision, pp. 503\u2013510 (2011)","DOI":"10.1109\/ICCV.2011.6126281"},{"key":"7_CR16","unstructured":"Ren, Y., et al.: FastSpeech 2: fast and high-quality end-to-end text to speech. In: 9th International Conference on Learning Representations (2021)"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Tang, H., Zhang, X., Cheng, N., Xiao, J., Wang, J.: ED-TTS: multi-scale emotion modeling using cross-domain emotion diarization for emotional speech synthesis. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 12146\u201312150 (2024)","DOI":"10.1109\/ICASSP48485.2024.10446467"},{"key":"7_CR18","doi-asserted-by":"crossref","unstructured":"Tang, H., Zhang, X., Wang, J., Cheng, N., Xiao, J.: EmoMix: emotion mixing via diffusion models for emotional speech synthesis. In: 24th Annual Conference of the International Speech Communication Association, pp. 12\u201316 (2023)","DOI":"10.21437\/Interspeech.2023-1317"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Tang, H., Zhang, X., Wang, J., Cheng, N., Xiao, J.: QI-TTS: questioning intonation control for emotional speech synthesis. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10095623"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Tits, N., Haddad, K.E., Dutoit, T.: Exploring transfer learning for low resource emotional TTS. In: Intelligent Systems and Applications - Proceedings of the 2019 Intelligent Systems Conference, vol.\u00a01037, pp. 52\u201360 (2019)","DOI":"10.1007\/978-3-030-29516-5_5"},{"key":"7_CR21","unstructured":"Vaswani, A., et al.: Attention is all you need. 
In: Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, pp. 5998\u20136008 (2017)"},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Wang, D., Deng, L., Yeung, Y.T., Chen, X., Liu, X., Meng, H.: VQMIVC: vector quantization and mutual information-based unsupervised speech representation disentanglement for one-shot voice conversion. In: 22nd Annual Conference of the International Speech Communication Association, pp. 1344\u20131348 (2021)","DOI":"10.21437\/Interspeech.2021-283"},{"key":"7_CR23","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Tacotron: towards end-to-end speech synthesis. In: 18th Annual Conference of the International Speech Communication Association, pp. 4006\u20134010 (2017)","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"7_CR24","unstructured":"Wang, Y., et al.: Style tokens: unsupervised style modeling, control and transfer in end-to-end speech synthesis. In: Proceedings of the 35th International Conference on Machine Learning, pp. 5167\u20135176 (2018)"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Zeng, A., Chen, M., Zhang, L., Xu, Q.: Are transformers effective for time series forecasting? In: Thirty-Seventh AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 11121\u201311128 (2023)","DOI":"10.1609\/aaai.v37i9.26317"},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Zhang, Y.J., Pan, S., He, L., Ling, Z.H.: Learning latent representations for style control and transfer in end-to-end speech synthesis. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 6945\u20136949 (2019)","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"7_CR27","doi-asserted-by":"crossref","unstructured":"Zhou, K., Sisman, B., Liu, R., Li, H.: Seen and unseen emotional style transfer for voice conversion with a new emotional speech dataset. 
In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 920\u2013924 (2021)","DOI":"10.1109\/ICASSP39728.2021.9413391"},{"key":"7_CR28","doi-asserted-by":"publisher","first-page":"3120","DOI":"10.1109\/TAFFC.2022.3233324","volume":"14","author":"K Zhou","year":"2023","unstructured":"Zhou, K., Sisman, B., Rana, R., Schuller, B.W., Li, H.: Speech synthesis with mixed emotions. IEEE Trans. Affect. Comput. 14, 3120\u20133134 (2023)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Zhu, X., Yang, S., Yang, G., Xie, L.: Controlling emotion strength with relative attribute for end-to-end speech synthesis. In: IEEE Automatic Speech Recognition and Understanding Workshop, pp. 192\u2013199 (2019)","DOI":"10.1109\/ASRU46091.2019.9003829"}],"container-title":["Lecture Notes in Computer Science","Web and Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-7232-2_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T09:37:33Z","timestamp":1732700253000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-7232-2_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819772315","9789819772322"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-7232-2_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"28 August 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"APWeb-WAIM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Jinhua","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"apwebwaim2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/apweb2024.zjnu.edu.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}