{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:16:33Z","timestamp":1750220193054,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,9,23]],"date-time":"2022-09-23T00:00:00Z","timestamp":1663891200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,9,23]]},"DOI":"10.1145\/3573942.3574120","type":"proceedings-article","created":{"date-parts":[[2023,5,16]],"date-time":"2023-05-16T23:45:42Z","timestamp":1684280742000},"page":"950-955","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Voicifier-LN: An Novel Approach to Elevate the Speaker Similarity for General Zero-shot Multi-Speaker TTS"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8459-0412","authenticated-orcid":false,"given":"Dengfeng","family":"Ke","sequence":"first","affiliation":[{"name":"Beijing Language and Culture University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0770-1582","authenticated-orcid":false,"given":"Liangjie","family":"Huang","sequence":"additional","affiliation":[{"name":"Beijing Language and Culture University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1014-9565","authenticated-orcid":false,"given":"Wenhan","family":"Yao","sequence":"additional","affiliation":[{"name":"Beijing Language and Culture University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0999-0712","authenticated-orcid":false,"given":"Ruixin","family":"Hu","sequence":"additional","affiliation":[{"name":"Beijing Language and Culture University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7819-4015","authenticated-orcid":false,"given":"Xueyin","family":"Zu","sequence":"additional","affiliation":[{"name":"Beijing Language and Culture University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6765-4808","authenticated-orcid":false,"given":"Yanlu","family":"Xie","sequence":"additional","affiliation":[{"name":"Beijing Language and Culture University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1603-3136","authenticated-orcid":false,"given":"Jinsong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing Language and Culture University, China"}]}],"member":"320","published-online":{"date-parts":[[2023,5,16]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 35th International Conference on Machine Learning, PMLR 80:5180-5189","author":"Wang Yuxuan","year":"2018","unstructured":"Yuxuan Wang, Daisy Stanton, Yu Zhang 2018. Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis. Proceedings of the 35th International Conference on Machine Learning, PMLR 80:5180-5189, 2018."},{"key":"e_1_3_2_1_2_1","unstructured":"Guangzhi Sun Yu Zhang Ron J. Weiss. 2020. GENERATING DIVERSE AND NATURAL TEXT-TO-SPEECH SAMPLES USING A QUANTIZED FINE-GRAINED VAE AND AUTOREGRESSIVE PROSODY PRIOR. ICASSP 2020 - 2020 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP)."},{"key":"e_1_3_2_1_3_1","volume-title":"Towards Multi-Scale Style Control for Expressive Speech Synthesis. ICASSP 2020 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Li Xiang","year":"2021","unstructured":"Xiang Li, Changhe Song, Jingbei Li. 2021. Towards Multi-Scale Style Control for Expressive Speech Synthesis. ICASSP 2020 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"},{"key":"e_1_3_2_1_4_1","volume-title":"NIPS'18: Proceedings of the 32nd International Conference on Neural Information Processing Systems.","author":"Sercan","year":"2018","unstructured":"Arik, Sercan O, 2018. Neural Voice Cloning with a Few Samples. NIPS'18: Proceedings of the 32nd International Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_5_1","unstructured":"Yutian Chen 2019. SAMPLE EFFICIENT ADAPTIVE TEXT-TO-SPEECH. ICLR 7 th International Conference on Learning Representations."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, PMLR 139:7748-7759","author":"Min Dongchan","year":"2019","unstructured":"Dongchan Min, 2019. Meta-StyleSpeech : Multi-Speaker Adaptive Text-to-Speech Generation. Proceedings of the 38th International Conference on Machine Learning, PMLR 139:7748-7759, 2021."},{"key":"e_1_3_2_1_7_1","volume-title":"ZERO-SHOT MULTI-SPEAKER TEXT-TO-SPEECH WITH STATE-OF-THE-ART NEURAL SPEAKER EMBEDDINGS. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP).","author":"Cooper Erica","year":"2020","unstructured":"Erica Cooper, 2020. ZERO-SHOT MULTI-SPEAKER TEXT-TO-SPEECH WITH STATE-OF-THE-ART NEURAL SPEAKER EMBEDDINGS. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)."},{"key":"e_1_3_2_1_8_1","volume-title":"PROSOSPEECH: ENHANCING PROSODY WITH QUANTIZED VECTOR PRE-TRAINING IN TEXT-TO-SPEECH. ICASSP 2021 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP).","author":"Ren Yi","year":"2022","unstructured":"Yi Ren, 2022. PROSOSPEECH: ENHANCING PROSODY WITH QUANTIZED VECTOR PRE-TRAINING IN TEXT-TO-SPEECH. ICASSP 2021 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)."},{"key":"e_1_3_2_1_9_1","volume-title":"Thirty-seventh International Conference on Machine Learning(ICML).","author":"Qian Kaizhi","year":"2021","unstructured":"Kaizhi Qian, 2021. Unsupervised Speech Decomposition via Triple Information Bottleneck. Thirty-seventh International Conference on Machine Learning(ICML)."},{"key":"e_1_3_2_1_10_1","volume-title":"L2-arctic: A non-native english speech corpus. Interspeech","author":"Zhao Guanlong","year":"2018","unstructured":"Guanlong Zhao, Sinem Sonsaat, Alif O. Silpachai, 2018. L2-arctic: A non-native english speech corpus. Interspeech 2018."},{"key":"e_1_3_2_1_11_1","volume-title":"AISHELL-3: A MULTI-SPEAKER MANDARIN TTS CORPUS AND THE BASELINES. Interspeech","author":"Shi Yao","year":"2021","unstructured":"Yao Shi. 2021. AISHELL-3: A MULTI-SPEAKER MANDARIN TTS CORPUS AND THE BASELINES. Interspeech 2021."},{"key":"e_1_3_2_1_12_1","unstructured":"Heiga Zen Viet Dang Rob Clark Yu Zhang Ron J. Weiss Ye Jia Zhifeng Chen Yonghui Wu. 2019. Interspeech 2019."},{"key":"e_1_3_2_1_13_1","unstructured":"Yi Ren Chenxu Hu Xu Tan 2021. FASTSPEECH 2: FAST AND HIGH-QUALITY END-TOEND TEXT TO SPEECH. ICLR 9 th International Conference on Learning Representations."},{"key":"e_1_3_2_1_14_1","volume-title":"LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS. ICASSP 2014 - 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP).","author":"Panayotov Vassil","year":"2015","unstructured":"Vassil Panayotov, Guoguo Chen, Daniel Povey, Sanjeev Khudanpur. 2015. LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS. ICASSP 2014 - 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)."},{"volume-title":"NNSPEECH: SPEAKER-GUIDED CONDITIONAL VARIATIONAL AUTOENCODER FOR ZERO-SHOT MULTI-SPEAKER TEXT-TO-SPEECH","author":"Botao Zhao","key":"e_1_3_2_1_15_1","unstructured":"Botao Zhao 2022. NNSPEECH: SPEAKER-GUIDED CONDITIONAL VARIATIONAL AUTOENCODER FOR ZERO-SHOT MULTI-SPEAKER TEXT-TO-SPEECH. Ping An Technology (Shenzhen) Co., Ltd., China."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 35th International Conference on Machine Learning, PMLR 80:5180-5189","author":"Wang Yuxuan","year":"2018","unstructured":"Yuxuan Wang, Daisy Stanton, Yu Zhang 2018. Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis. Proceedings of the 35th International Conference on Machine Learning, PMLR 80:5180-5189, 2018."},{"key":"e_1_3_2_1_17_1","unstructured":"Xu Tan 2018. A Survey on Neural Speech Synthesis. Microsoft Research Asia."},{"key":"e_1_3_2_1_18_1","volume-title":"33th International Conference on Machine Learning(ICML).","author":"Skerry-Ryan RJ","year":"2018","unstructured":"RJ Skerry-Ryan 2018. Towards end-to-end prosody transfer for expressive speech synthesis with tacotron. 33th International Conference on Machine Learning(ICML)."},{"key":"e_1_3_2_1_19_1","volume-title":"Adaspeech 3: Adaptive text to speech for spontaneous style. Interspeech","author":"Yuzi Yan","year":"2021","unstructured":"Yuzi Yan 2021. Adaspeech 3: Adaptive text to speech for spontaneous style. Interspeech 2021."},{"key":"e_1_3_2_1_20_1","unstructured":"Mingjian Chen 2021. Adaspeech: Adaptive text to speech for custom voice. ICLR 9 th International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","unstructured":"Kaiyang Zhou 2021. Domain generalization with mixstyle. ICLR 9 th International Conference on Learning Representations."},{"key":"e_1_3_2_1_22_1","volume-title":"Can Speaker Augmentation Improve Multi-Speaker End-to-End TTS. Interspeech","author":"Cooper Erica","year":"2020","unstructured":"Erica Cooper, Cheng-I Lai 2020. Can Speaker Augmentation Improve Multi-Speaker End-to-End TTS. Interspeech 2020."},{"key":"e_1_3_2_1_23_1","volume-title":"RJ Skerry-Ryan","author":"Wang Yuxuan","year":"2017","unstructured":"Yuxuan Wang, RJ Skerry-Ryan 2017. Tacotron: Towards end-to-end speech synthesis. Interspeech2017."},{"key":"e_1_3_2_1_24_1","volume-title":"MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis. 34th Conference on Neural Information Processing Systems.","author":"Kumar Kundan","year":"2019","unstructured":"Kundan Kumar, Rithesh Kumar 2019. MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis. 34th Conference on Neural Information Processing Systems."},{"issue":"2014","key":"e_1_3_2_1_25_1","first-page":"1","article-title":"Accelerating t-SNE using Tree-Based Algorithms","volume":"15","author":"van der Maaten Laurens","year":"2019","unstructured":"Laurens van der Maaten 2019. Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research 15 (2014) 1-21.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_26_1","volume-title":"End-to-end text-to-speech for low-resource languages by cross-lingual transfer learning. Interspeech","author":"Chen Yuan-Jui","year":"2019","unstructured":"Yuan-Jui Chen, Hung-yi Lee 2019. End-to-end text-to-speech for low-resource languages by cross-lingual transfer learning. Interspeech 2019."},{"key":"e_1_3_2_1_27_1","volume-title":"Muhammed PV Shifas","author":"Paul Dipjyoti","year":"2020","unstructured":"Dipjyoti Paul, Muhammed PV Shifas 2020. Enhancing speech intelligibility in text-to-speech synthesis using speaking style conversion. Interspeech 2020."},{"volume-title":"One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization. Interspeech","year":"2019","key":"e_1_3_2_1_28_1","unstructured":"Ju-chieh Chou, Cheng-chieh Yeh, Hung-yi Lee. 2019. One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization. Interspeech 2019."}],"event":{"name":"AIPR 2022: 2022 5th International Conference on Artificial Intelligence and Pattern Recognition","acronym":"AIPR 2022","location":"Xiamen China"},"container-title":["Proceedings of the 2022 5th International Conference on Artificial Intelligence and Pattern Recognition"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3573942.3574120","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3573942.3574120","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:33Z","timestamp":1750186953000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3573942.3574120"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,23]]},"references-count":28,"alternative-id":["10.1145\/3573942.3574120","10.1145\/3573942"],"URL":"https:\/\/doi.org\/10.1145\/3573942.3574120","relation":{},"subject":[],"published":{"date-parts":[[2022,9,23]]},"assertion":[{"value":"2023-05-16","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}