{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,2]],"date-time":"2026-07-02T05:35:19Z","timestamp":1782970519793,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680777","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"7523-7532","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":36,"title":["From Speaker to Dubber: Movie Dubbing with Prosody and Duration Consistency Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1129-8914","authenticated-orcid":false,"given":"Zhedong","family":"Zhang","sequence":"first","affiliation":[{"name":"Hangzhou Dianzi University, Hangzhou, Zhejiang Province, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1943-8219","authenticated-orcid":false,"given":"Liang","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1239-8119","authenticated-orcid":false,"given":"Gaoxiang","family":"Cong","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3025-0938","authenticated-orcid":false,"given":"Haibing","family":"Yin","sequence":"additional","affiliation":[{"name":"Hangzhou Dianzi University &amp; Lishui Institute of Hangzhou Dianzi University, Hangzhou, Zhejiang Province, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7777-3613","authenticated-orcid":false,"given":"Yuhan","family":"Gao","sequence":"additional","affiliation":[{"name":"Hangzhou Dianzi University &amp; Lishui Institute of Hangzhou Dianzi University, Hangzhou, Zhejiang Province, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1204-0512","authenticated-orcid":false,"given":"Chenggang","family":"Yan","sequence":"additional","affiliation":[{"name":"Hangzhou Dianzi University, Hangzhou City, Zhejiang Province, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3027-8364","authenticated-orcid":false,"given":"Anton van den","family":"Hengel","sequence":"additional","affiliation":[{"name":"University of Adelaide, City of Adelaide, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4312-5682","authenticated-orcid":false,"given":"Yuankai","family":"Qi","sequence":"additional","affiliation":[{"name":"Macquarie University, Sydney, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018","author":"Arik Sercan \u00d6mer","year":"2018","unstructured":"Sercan \u00d6mer Arik, Jitong Chen, Kainan Peng, Wei Ping, and Yanqi Zhou. 2018. Neural Voice Cloning with a Few Samples. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3-8, 2018, Montr\u00e9al, Canada. 10040--10050."},{"key":"e_1_3_2_1_2_1","volume-title":"22nd Annual Conference of the International Speech Communication Association","author":"Casanova Edresson","year":"2021","unstructured":"Edresson Casanova, Christopher Shulby, Eren G\u00f6lge, Nicolas Michael M\u00fcller, Frederico Santos de Oliveira, Arnaldo Candido Jr., Anderson da Silva Soares, Sandra Maria Alu\u00edsio, and Moacir Antonelli Ponti. [n.,d.]. SC-GlowTTS: An Efficient Zero-Shot Multi-Speaker Text-To-Speech Model. In Interspeech 2021, 22nd Annual Conference of the International Speech Communication Association, Brno, Czechia, 30 August - 3 September 2021. 3645--3649."},{"key":"e_1_3_2_1_3_1","volume-title":"YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for Everyone. In International Conference on Machine Learning, ICML 2022","volume":"162","author":"Casanova Edresson","year":"2022","unstructured":"Edresson Casanova, Julian Weber, Christopher Dane Shulby, Arnaldo C\u00e2ndido J\u00fanior, Eren G\u00f6lge, and Moacir A. Ponti. 2022. YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for Everyone. In International Conference on Machine Learning, ICML 2022, 17--23 July 2022, Baltimore, Maryland, USA, Vol. 162. 2709--2720."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02056"},{"key":"e_1_3_2_1_5_1","first-page":"1111","article-title":"Review of omnimedia content quality evaluation","volume":"38","author":"Chenggang Y","year":"2022","unstructured":"Y Chenggang, S Yaoqi, Z Hao, Z Chenwei, Z Zunjie, Z Bolun, and Z Xiaofei. 2022. Review of omnimedia content quality evaluation. J. Signal Process., Vol. 38, 6 (2022), 1111--1143.","journal-title":"J. Signal Process."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01411"},{"key":"e_1_3_2_1_7_1","volume-title":"StyleDubber: Towards Multi-Scale Style Learning for Movie Dubbing. arXiv preprint arXiv:2402.12636","author":"Cong Gaoxiang","year":"2024","unstructured":"Gaoxiang Cong, Yuankai Qi, Liang Li, Amin Beheshti, Zhedong Zhang, Anton van den Hengel, Ming-Hsuan Yang, Chenggang Yan, and Qingming Huang. 2024. StyleDubber: Towards Multi-Scale Style Learning for Movie Dubbing. arXiv preprint arXiv:2402.12636 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680899"},{"key":"e_1_3_2_1_10_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR, Vol. abs\/1810.04805 (2018)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29769"},{"key":"e_1_3_2_1_12_1","volume-title":"VoiceFlow: Efficient Text-to-Speech with Rectified Flow Matching. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 11121--11125","author":"Guo Yiwei","year":"2024","unstructured":"Yiwei Guo, Chenpeng Du, Ziyang Ma, Xie Chen, and Kai Yu. 2024. VoiceFlow: Efficient Text-to-Speech with Rectified Flow Matching. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 11121--11125."},{"key":"e_1_3_2_1_13_1","volume-title":"Speech Prosody in Speech Synthesis: Modeling and generation of prosody for high quality and flexible speech synthesis","author":"Hirose Keikichi","unstructured":"Keikichi Hirose and Jianhua Tao. 2015. Speech Prosody in Speech Synthesis: Modeling and generation of prosody for high quality and flexible speech synthesis. Springer."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_15_1","volume-title":"Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021","author":"Hu Chenxu","year":"2021","unstructured":"Chenxu Hu, Qiao Tian, Tingle Li, Yuping Wang, Yuxuan Wang, and Hang Zhao. 2021. Neural Dubber: Dubbing for Videos According to Scripts. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6-14, 2021, virtual. 16582--16595."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i21.30570"},{"key":"e_1_3_2_1_17_1","volume-title":"ProDiff: Progressive Fast Diffusion Model for High-Quality Text-to-Speech. In MM '22: The 30th ACM International Conference on Multimedia","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Zhou Zhao, Huadai Liu, Jinglin Liu, Chenye Cui, and Yi Ren. 2022. ProDiff: Progressive Fast Diffusion Model for High-Quality Text-to-Speech. In MM '22: The 30th ACM International Conference on Multimedia, Lisboa, Portugal, October 10 - 14, 2022. 2595--2605."},{"key":"e_1_3_2_1_18_1","volume-title":"22nd Annual Conference of the International Speech Communication Association","author":"Jia Ye","year":"2021","unstructured":"Ye Jia, Heiga Zen, Jonathan Shen, Yu Zhang, and Yonghui Wu. 2021. PnG BERT: Augmented BERT on Phonemes and Graphemes for Neural TTS. In Interspeech 2021, 22nd Annual Conference of the International Speech Communication Association, Brno, Czechia, 30 August - 3 September 2021. 151--155."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2307.07218"},{"key":"e_1_3_2_1_20_1","unstructured":"Zeqian Ju Yuancheng Wang Kai Shen Xu Tan Detai Xin Dongchao Yang Yanqing Liu Yichong Leng Kaitao Song Siliang Tang et al. 2024. NaturalSpeech 3: Zero-Shot Speech Synthesis with Factorized Codec and Diffusion Models. arXiv preprint arXiv:2403.03100 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00618"},{"key":"e_1_3_2_1_23_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Kim Jaehyeon","year":"2020","unstructured":"Jaehyeon Kim, Sungwon Kim, Jungil Kong, and Sungroh Yoon. 2020. Glow-TTS: A Generative Flow for Text-to-Speech via Monotonic Alignment Search. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual. https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/5c3b99e8f92532e5ad1556e53ceea00c-Abstract.html"},{"key":"e_1_3_2_1_24_1","volume-title":"Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings.","author":"Diederik","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings."},{"key":"e_1_3_2_1_25_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual. https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/c5d736809766d46260d816d8dbc9eb44-Abstract.html"},{"key":"e_1_3_2_1_26_1","volume-title":"Imaginary Voice: Face-Styled Diffusion Model for Text-to-Speech. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023","author":"Lee Jiyoung","year":"2023","unstructured":"Jiyoung Lee, Joon Son Chung, and Soo-Whan Chung. 2023. Imaginary Voice: Face-Styled Diffusion Model for Text-to-Speech. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4-10, 2023. 1--5."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3158546"},{"key":"e_1_3_2_1_28_1","volume-title":"Phoneme-Level Bert for Enhanced Prosody of Text-To-Speech with Grapheme Predictions. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023","author":"Li Yinghao Aaron","year":"2023","unstructured":"Yinghao Aaron Li, Cong Han, Xilin Jiang, and Nima Mesgarani. 2023. Phoneme-Level Bert for Enhanced Prosody of Text-To-Speech with Grapheme Predictions. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4-10, 2023. IEEE, 1--5."},{"key":"e_1_3_2_1_29_1","volume-title":"Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023","author":"Li Yinghao Aaron","year":"2023","unstructured":"Yinghao Aaron Li, Cong Han, Vinay S. Raghavan, Gavin Mischler, and Nima Mesgarani. 2023. StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models. In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023. http:\/\/papers.nips.cc\/paper_files\/paper\/2023\/hash\/3eaad2a0b62b5ed7a2e66c2188bb1449-Abstract-Conference.html"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.23919\/cje.2022.00.173"},{"key":"e_1_3_2_1_31_1","first-page":"3003","article-title":"Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding","volume":"45","author":"Liu Xuejing","year":"2022","unstructured":"Xuejing Liu, Liang Li, Shuhui Wang, Zheng-Jun Zha, Zechao Li, Qi Tian, and Qingming Huang. 2022. Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 3 (2022), 3003--3018.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_32_1","volume-title":"18th Annual Conference of the International Speech Communication Association","author":"McAuliffe Michael","year":"2017","unstructured":"Michael McAuliffe, Michaela Socolof, Sarah Mihuc, Michael Wagner, and Morgan Sonderegger. 2017. Montreal Forced Aligner: Trainable Text-Speech Alignment Using Kaldi. In Interspeech 2017, 18th Annual Conference of the International Speech Communication Association, Stockholm, Sweden, August 20-24, 2017. 498--502."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","volume":"139","author":"Min Dongchan","year":"2021","unstructured":"Dongchan Min, Dong Bok Lee, Eunho Yang, and Sung Ju Hwang. 2021. Meta-StyleSpeech: Multi-Speaker Adaptive Text-to-Speech Generation. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event, Vol. 139. 7748--7759."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2305.19709"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24","volume":"139","author":"Popov Vadim","year":"2021","unstructured":"Vadim Popov, Ivan Vovk, Vladimir Gogoryan, Tasnima Sadekova, and Mikhail A. Kudinov. 2021. Grad-TTS: A Diffusion Probabilistic Model for Text-to-Speech. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18--24 July 2021, Virtual Event, Vol. 139. 8599--8608."},{"key":"e_1_3_2_1_38_1","volume-title":"International Conference on Machine Learning, ICML 2023","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust Speech Recognition via Large-Scale Weak Supervision. In International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA. 28492--28518."},{"key":"e_1_3_2_1_39_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Ren Yi","year":"2021","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2021. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. https:\/\/openreview.net\/forum?id=piLPYqxtWuA"},{"key":"e_1_3_2_1_40_1","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019","author":"Ren Yi","year":"2019","unstructured":"Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. FastSpeech: Fast, Robust and Controllable Text to Speech. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada. 3165--3174."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2304.09116"},{"key":"e_1_3_2_1_42_1","volume-title":"The Eighth ISCA Tutorial and Research Workshop on Speech Synthesis","author":"Suni Antti","year":"2013","unstructured":"Antti Suni, Daniel Aalto, Tuomo Raitio, Paavo Alku, and Martti Vainio. 2013. Wavelets for intonation modeling in HMM speech synthesis. In The Eighth ISCA Tutorial and Research Workshop on Speech Synthesis, Barcelona, Spain, August 31-September 2, 2013. 285--290."},{"key":"e_1_3_2_1_43_1","volume-title":"Naturalspeech: End-to-end text-to-speech synthesis with human-level quality","author":"Tan Xu","year":"2024","unstructured":"Xu Tan, Jiawei Chen, Haohe Liu, Jian Cong, Chen Zhang, Yanqing Liu, Xi Wang, Yichong Leng, Yuanhao Yi, Lei He, et al. 2024. Naturalspeech: End-to-end text-to-speech synthesis with human-level quality. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.23919\/cje.2022.00.284"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00280-0"},{"key":"e_1_3_2_1_46_1","volume-title":"Smart: Syntax-calibrated multi-aspect relation transformer for change captioning","author":"Tu Yunbin","year":"2024","unstructured":"Yunbin Tu, Liang Li, Li Su, Zheng-Jun Zha, and Qingming Huang. 2024. Smart: Syntax-calibrated multi-aspect relation transformer for change captioning. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Generalized End-to-End Loss for Speaker Verification. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2018","author":"Wan Li","year":"2018","unstructured":"Li Wan, Quan Wang, Alan Papir, and Ignacio L\u00f3pez-Moreno. 2018. Generalized End-to-End Loss for Speaker Verification. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2018, Calgary, AB, Canada, April 15-20, 2018. 4879--4883."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2301.02111"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3226328"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2975798"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3067449"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404374"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472810"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3468872"},{"key":"e_1_3_2_1_55_1","volume-title":"Temporal Modeling Matters: A Novel Temporal Emotional Modeling Approach for Speech Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023","author":"Ye Jiaxin","year":"2023","unstructured":"Jiaxin Ye, Xin-Cheng Wen, Yujie Wei, Yong Xu, Kunhong Liu, and Hongming Shan. 2023. Temporal Modeling Matters: A Novel Temporal Emotional Modeling Approach for Speech Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4--10, 2023. IEEE, 1--5."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1049\/cje.2021.00.455"},{"key":"e_1_3_2_1_57_1","volume-title":"20th Annual Conference of the International Speech Communication Association","author":"Zen Heiga","year":"2019","unstructured":"Heiga Zen, Viet Dang, Rob Clark, Yu Zhang, Ron J. Weiss, Ye Jia, Zhifeng Chen, and Yonghui Wu. 2019. LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech. In Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, Graz, Austria, 15-19 September 2019. 1526--1530."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3432099"},{"key":"e_1_3_2_1_59_1","volume-title":"23rd Annual Conference of the International Speech Communication Association","author":"Zhang Guangyan","year":"2022","unstructured":"Guangyan Zhang, Kaitao Song, Xu Tan, Daxin Tan, Yuzi Yan, Yanqing Liu, Gang Wang, Wei Zhou, Tao Qin, Tan Lee, and Sheng Zhao. 2022. Mixed-Phoneme BERT: Improving BERT with Mixed Phoneme and Sup-Phoneme Representations for Text to Speech. In Interspeech 2022, 23rd Annual Conference of the International Speech Communication Association, Incheon, Korea, 18-22 September 2022. 456--460."},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the IEEE international conference on computer vision. 192--201","author":"Zhang Shifeng","year":"2017","unstructured":"Shifeng Zhang, Xiangyu Zhu, Zhen Lei, Hailin Shi, Xiaobo Wang, and Stan Z Li. 2017. S3fd: Single shot scale-invariant face detector. In Proceedings of the IEEE international conference on computer vision. 192--201."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.23919\/cje.2022.00.414"},{"key":"e_1_3_2_1_62_1","volume-title":"23rd Annual Conference of the International Speech Communication Association","author":"Zhou Yixuan","year":"2022","unstructured":"Yixuan Zhou, Changhe Song, Xiang Li, Luwen Zhang, Zhiyong Wu, Yanyao Bian, Dan Su, and Helen Meng. 2022. Content-Dependent Fine-Grained Speaker Embedding for Zero-Shot Speaker Adaptation in Text-to-Speech Synthesis. In Interspeech 2022, 23rd Annual Conference of the International Speech Communication Association, Incheon, Korea, 18-22 September 2022. 2573--2577. gr"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680777","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680777","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:42Z","timestamp":1750294662000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680777"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":62,"alternative-id":["10.1145\/3664647.3680777","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680777","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}