{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T08:56:53Z","timestamp":1765357013086,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Higher Education Sprout Project of the National Yang Ming Chiao Tung University and Ministry of Education (MOE), Taiwan"},{"name":"Ministry of Science and Technology (MOST) of Taiwan","award":["MOST-109-2223-E-009-002-MY3,MOST-110-2634-F-007-015,MOST-109-2218-E-002-015,MOST-109-2221-E-009-114-MY3,MOST-110-2218-E-A49-018,MOST-109-2327-B-010-005,MOST-109-2221-E-009-097,MOST-109-2221-E-001-015"],"award-info":[{"award-number":["MOST-109-2223-E-009-002-MY3,MOST-110-2634-F-007-015,MOST-109-2218-E-002-015,MOST-109-2221-E-009-114-MY3,MOST-110-2218-E-A49-018,MOST-109-2327-B-010-005,MOST-109-2221-E-009-097,MOST-109-2221-E-001-015"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475198","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T10:23:20Z","timestamp":1634552600000},"page":"496-505","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Face-based Voice Conversion: Learning the Voice behind a Face"],"prefix":"10.1145","author":[{"given":"Hsiao-Han","family":"Lu","sequence":"first","affiliation":[{"name":"National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc"}]},{"given":"Shao-En","family":"Weng","sequence":"additional","affiliation":[{"name":"National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc"}]},{"given":"Ya-Fan","family":"Yen","sequence":"additional","affiliation":[{"name":"National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc"}]},{"given":"Hong-Han","family":"Shuai","sequence":"additional","affiliation":[{"name":"National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc"}]},{"given":"Wen-Huang","family":"Cheng","sequence":"additional","affiliation":[{"name":"National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"3244","article-title":"The Conversation","volume":"2018","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras , Joon Son Chung , and Andrew Zisserman . 2018 a. The Conversation : Deep Audio-Visual Speech Enhancement. In Proc. Interspeech 2018. 3244 -- 3248 . https:\/\/doi.org\/10.21437\/Interspeech.2018--1400 10.21437\/Interspeech.2018--1400 Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018a. The Conversation: Deep Audio-Visual Speech Enhancement. In Proc. Interspeech 2018. 3244--3248. https:\/\/doi.org\/10.21437\/Interspeech.2018--1400","journal-title":"Deep Audio-Visual Speech Enhancement. In Proc. Interspeech"},{"key":"e_1_3_2_2_2_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras , Joon Son Chung, and Andrew Zisserman . 2018 b. LRS 3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018). Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018b. LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018)."},{"doi-asserted-by":"crossref","unstructured":"Sandesh Aryal Daniel Felps and Ricardo Gutierrez-Osuna. 2013. Foreign Accent Conversion through Voice Morphing. In Interspeech-2013. 3077--3081.  Sandesh Aryal Daniel Felps and Ricardo Gutierrez-Osuna. 2013. Foreign Accent Conversion through Voice Morphing. In Interspeech-2013. 3077--3081.","key":"e_1_3_2_2_3_1","DOI":"10.21437\/Interspeech.2013-671"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_4_1","DOI":"10.1109\/FG.2018.00020"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_5_1","DOI":"10.1007\/978-3-030-58583-9_42"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_6_1","DOI":"10.1145\/3394171.3413710"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_7_1","DOI":"10.21437\/Interspeech.2019-2663"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_8_1","DOI":"10.1145\/3394171.3413700"},{"key":"e_1_3_2_2_9_1","first-page":"3486","article-title":"Seeing Voices and Hearing Voices","volume":"2020","author":"Chung Soo-Whan","year":"2020","unstructured":"Soo-Whan Chung , Hong-Goo Kang , and Joon Son Chung . 2020 . Seeing Voices and Hearing Voices : Learning Discriminative Embeddings Using Cross-Modal Self-Supervision. In Proc. Interspeech 2020. 3486 -- 3490 . https:\/\/doi.org\/10.21437\/Interspeech.2020--1113 10.21437\/Interspeech.2020--1113 Soo-Whan Chung, Hong-Goo Kang, and Joon Son Chung. 2020. Seeing Voices and Hearing Voices: Learning Discriminative Embeddings Using Cross-Modal Self-Supervision. In Proc. Interspeech 2020. 3486--3490. https:\/\/doi.org\/10.21437\/Interspeech.2020--1113","journal-title":"Learning Discriminative Embeddings Using Cross-Modal Self-Supervision. In Proc. Interspeech"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_10_1","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_11_1","DOI":"10.1145\/3197517.3201357"},{"doi-asserted-by":"crossref","unstructured":"Mahdi Eslami Hamid Sheikhzadeh and Abolghasem Sayadiyan. 2011. Quality improvement of voice conversion systems based on trellis structured vector quantization. In Interspeech-2011. 665--668.  Mahdi Eslami Hamid Sheikhzadeh and Abolghasem Sayadiyan. 2011. Quality improvement of voice conversion systems based on trellis structured vector quantization. In Interspeech-2011. 665--668.","key":"e_1_3_2_2_12_1","DOI":"10.21437\/Interspeech.2011-271"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_13_1","DOI":"10.1109\/ICASSP.2018.8462342"},{"key":"e_1_3_2_2_14_1","volume-title":"Foley music: Learning to generate music from videos. arXiv preprint arXiv:2007.10984","author":"Gan Chuang","year":"2020","unstructured":"Chuang Gan , Deng Huang , Peihao Chen , Joshua B Tenenbaum , and Antonio Torralba . 2020. Foley music: Learning to generate music from videos. arXiv preprint arXiv:2007.10984 , Vol. 4 , 6 ( 2020 ), 7. Chuang Gan, Deng Huang, Peihao Chen, Joshua B Tenenbaum, and Antonio Torralba. 2020. Foley music: Learning to generate music from videos. arXiv preprint arXiv:2007.10984, Vol. 4, 6 (2020), 7."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_15_1","DOI":"10.1109\/TMM.2020.2991507"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_16_1","DOI":"10.1145\/3240508.3240601"},{"key":"e_1_3_2_2_17_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Hu Di","year":"2020","unstructured":"Di Hu , Rui Qian , Minyue Jiang , Xiao Tan , Shilei Wen , Errui Ding , Weiyao Lin , and Dejing Dou . 2020 . Discriminative Sounding Objects Localization via Self-supervised Audiovisual Matching . Advances in Neural Information Processing Systems , Vol. 33 (2020). Di Hu, Rui Qian, Minyue Jiang, Xiao Tan, Shilei Wen, Errui Ding, Weiyao Lin, and Dejing Dou. 2020. Discriminative Sounding Objects Localization via Self-supervised Audiovisual Matching. Advances in Neural Information Processing Systems, Vol. 33 (2020)."},{"volume-title":"2021 IEEE Spoken Language Technology Workshop (SLT). IEEE, 514--521","year":"2021","unstructured":"Tzu-hsien Huang, Jheng-hao Lin, and Hung-yi Lee. 2021 . How Far Are We from Robust Voice Conversion: A Survey . In 2021 IEEE Spoken Language Technology Workshop (SLT). IEEE, 514--521 . Tzu-hsien Huang, Jheng-hao Lin, and Hung-yi Lee. 2021. How Far Are We from Robust Voice Conversion: A Survey. In 2021 IEEE Spoken Language Technology Workshop (SLT). IEEE, 514--521.","key":"e_1_3_2_2_18_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_19_1","DOI":"10.5555\/3327345.3327360"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_20_1","DOI":"10.1109\/SLT.2018.8639535"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_21_1","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_22_1","DOI":"10.21437\/Interspeech.2017-970"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_23_1","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"e_1_3_2_2_24_1","first-page":"679","article-title":"b. StarGAN-VC2","volume":"2019","author":"Kaneko Takuhiro","year":"2019","unstructured":"Takuhiro Kaneko , Hirokazu Kameoka , Kou Tanaka , and Nobukatsu Hojo . 2019 b. StarGAN-VC2 : Rethinking Conditional Methods for StarGAN-Based Voice Conversion. In Proc. Interspeech 2019. 679 -- 683 . https:\/\/doi.org\/10.21437\/Interspeech.2019--2236 10.21437\/Interspeech.2019--2236 Takuhiro Kaneko, Hirokazu Kameoka, Kou Tanaka, and Nobukatsu Hojo. 2019 b. StarGAN-VC2: Rethinking Conditional Methods for StarGAN-Based Voice Conversion. In Proc. Interspeech 2019. 679--683. https:\/\/doi.org\/10.21437\/Interspeech.2019--2236","journal-title":"Rethinking Conditional Methods for StarGAN-Based Voice Conversion. In Proc. Interspeech"},{"key":"e_1_3_2_2_25_1","first-page":"2017","article-title":"CycleGAN-VC3","volume":"2020","author":"Kaneko Takuhiro","year":"2020","unstructured":"Takuhiro Kaneko , Hirokazu Kameoka , Kou Tanaka , and Nobukatsu Hojo . 2020 . CycleGAN-VC3 : Examining and Improving CycleGAN-VCs for Mel-Spectrogram Conversion. In Proc. Interspeech 2020. 2017 -- 2021 . https:\/\/doi.org\/10.21437\/Interspeech.2020--2280 10.21437\/Interspeech.2020--2280 Takuhiro Kaneko, Hirokazu Kameoka, Kou Tanaka, and Nobukatsu Hojo. 2020. CycleGAN-VC3: Examining and Improving CycleGAN-VCs for Mel-Spectrogram Conversion. In Proc. Interspeech 2020. 2017--2021. https:\/\/doi.org\/10.21437\/Interspeech.2020--2280","journal-title":"Examining and Improving CycleGAN-VCs for Mel-Spectrogram Conversion. In Proc. Interspeech"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_26_1","DOI":"10.5555\/2969442.2969527"},{"key":"e_1_3_2_2_27_1","volume-title":"Fifth ISCA workshop on speech synthesis .","author":"Kominek John","year":"2004","unstructured":"John Kominek and Alan W Black . 2004 . The Carnegie Mellon University Arctic speech databases . In Fifth ISCA workshop on speech synthesis . John Kominek and Alan W Black. 2004. The Carnegie Mellon University Arctic speech databases. In Fifth ISCA workshop on speech synthesis ."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_28_1","DOI":"10.1145\/3240508.3241911"},{"doi-asserted-by":"crossref","unstructured":"Javier Latorre Vincent Wan and Kayoko Yanagisawa. 2014. Voice Expression Conversion with Factorised HMM-TTS Models. In Interspeech-2014. 1514--1518.  Javier Latorre Vincent Wan and Kayoko Yanagisawa. 2014. Voice Expression Conversion with Factorised HMM-TTS Models. In Interspeech-2014. 1514--1518.","key":"e_1_3_2_2_29_1","DOI":"10.21437\/Interspeech.2014-363"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_30_1","DOI":"10.1109\/ICASSP40776.2020.9054582"},{"key":"e_1_3_2_2_31_1","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton . 2008 . Visualizing data using t-SNE . Journal of machine learning research , Vol. 9 , Nov (2008), 2579 -- 2605 . Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, Nov (2008), 2579--2605.","journal-title":"Journal of machine learning research"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_32_1","DOI":"10.1109\/CVPR42600.2020.00144"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_33_1","DOI":"10.1145\/3394171.3413570"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_34_1","DOI":"10.1109\/ICASSP.2007.366960"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_35_1","DOI":"10.1109\/ICASSP40776.2020.9054057"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_36_1","DOI":"10.1145\/3343031.3351010"},{"doi-asserted-by":"crossref","unstructured":"Toru Nakashika Tetsuya Takiguchi and Yasuo Ariki. 2014. High-order sequence modeling using speaker-dependent recurrent temporal restricted Boltzmann machines for voice conversion. In Fifteenth annual conference of the international speech communication association .  Toru Nakashika Tetsuya Takiguchi and Yasuo Ariki. 2014. High-order sequence modeling using speaker-dependent recurrent temporal restricted Boltzmann machines for voice conversion. In Fifteenth annual conference of the international speech communication association .","key":"e_1_3_2_2_37_1","DOI":"10.21437\/Interspeech.2014-447"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_38_1","DOI":"10.1109\/CVPR.2019.00772"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_39_1","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"e_1_3_2_2_40_1","volume-title":"International Conference on Machine Learning. PMLR, 7836--7846","author":"Qian Kaizhi","year":"2020","unstructured":"Kaizhi Qian , Yang Zhang , Shiyu Chang , Mark Hasegawa-Johnson , and David Cox . 2020 . Unsupervised speech decomposition via triple information bottleneck . In International Conference on Machine Learning. PMLR, 7836--7846 . Kaizhi Qian, Yang Zhang, Shiyu Chang, Mark Hasegawa-Johnson, and David Cox. 2020. Unsupervised speech decomposition via triple information bottleneck. In International Conference on Machine Learning. PMLR, 7836--7846."},{"key":"e_1_3_2_2_41_1","volume-title":"International Conference on Machine Learning. PMLR, 5210--5219","author":"Qian Kaizhi","year":"2019","unstructured":"Kaizhi Qian , Yang Zhang , Shiyu Chang , Xuesong Yang , and Mark Hasegawa-Johnson . 2019 . Autovc: Zero-shot voice style transfer with only autoencoder loss . In International Conference on Machine Learning. PMLR, 5210--5219 . Kaizhi Qian, Yang Zhang, Shiyu Chang, Xuesong Yang, and Mark Hasegawa-Johnson. 2019. Autovc: Zero-shot voice style transfer with only autoencoder loss. In International Conference on Machine Learning. PMLR, 5210--5219."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_42_1","DOI":"10.1006\/dspr.1999.0361"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_43_1","DOI":"10.1109\/CVPR.2015.7298682"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_44_1","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_2_45_1","volume-title":"ICASSP 2020--2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2802--2806","author":"Lal Brij Mohan","year":"2020","unstructured":"Brij Mohan Lal Sr ivastava, Nathalie Vauquier , Md Sahidullah , Aur\u00e9lien Bellet , Marc Tommasi , and Emmanuel Vincent . 2020 . Evaluating voice conversion-based privacy protection against informed attackers . In ICASSP 2020--2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2802--2806 . Brij Mohan Lal Srivastava, Nathalie Vauquier, Md Sahidullah, Aur\u00e9lien Bellet, Marc Tommasi, and Emmanuel Vincent. 2020. Evaluating voice conversion-based privacy protection against informed attackers. In ICASSP 2020--2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2802--2806."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_46_1","DOI":"10.1109\/ICASSP.2019.8683282"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_47_1","DOI":"10.1109\/TASL.2012.2205241"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_48_1","DOI":"10.5555\/1134782.1134792"},{"key":"e_1_3_2_2_49_1","volume-title":"Interspeech","author":"Valentini-Botinhao Cassia","year":"2016","unstructured":"Cassia Valentini-Botinhao , Xin Wang , Shinji Takaki , and Junichi Yamagishi . 2016. Speech Enhancement for a Noise-Robust Text-to-Speech Synthesis System Using Deep Recurrent Neural Networks . In Interspeech 2016 . 352--356. https:\/\/doi.org\/10.21437\/Interspeech.2016--159 10.21437\/Interspeech.2016--159 Cassia Valentini-Botinhao, Xin Wang, Shinji Takaki, and Junichi Yamagishi. 2016. Speech Enhancement for a Noise-Robust Text-to-Speech Synthesis System Using Deep Recurrent Neural Networks. In Interspeech 2016. 352--356. https:\/\/doi.org\/10.21437\/Interspeech.2016--159"},{"key":"e_1_3_2_2_50_1","volume-title":"WaveNet: A Generative Model for Raw Audio. In 9th ISCA Speech Synthesis Workshop. 125--125","author":"van den Oord A\u00e4ron","year":"2016","unstructured":"A\u00e4ron van den Oord , Sander Dieleman , Heiga Zen , Karen Simonyan , Oriol Vinyals , Alex Graves , Nal Kalchbrenner , Andrew Senior , and Koray Kavukcuoglu . 2016 . WaveNet: A Generative Model for Raw Audio. In 9th ISCA Speech Synthesis Workshop. 125--125 . A\u00e4ron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. 2016. WaveNet: A Generative Model for Raw Audio. In 9th ISCA Speech Synthesis Workshop. 125--125."},{"unstructured":"Christophe Veaux Junichi Yamagishi Kirsten MacDonald etal 2017. Superseded-cstr vctk corpus: English multi-speaker corpus for cstr voice cloning toolkit. (2017).  Christophe Veaux Junichi Yamagishi Kirsten MacDonald et al. 2017. Superseded-cstr vctk corpus: English multi-speaker corpus for cstr voice cloning toolkit. (2017).","key":"e_1_3_2_2_51_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_52_1","DOI":"10.1109\/ICASSP.2018.8462665"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_53_1","DOI":"10.5555\/3454287.3454760"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_54_1","DOI":"10.21437\/Interspeech.2020-1443"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_55_1","DOI":"10.1109\/ICASSP40776.2020.9053854"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_56_1","DOI":"10.1109\/LSP.2016.2603342"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_57_1","DOI":"10.1109\/CVPR42600.2020.01235"},{"key":"e_1_3_2_2_58_1","volume-title":"Deep audio-visual learning: A survey. arXiv preprint arXiv:2001.04758","author":"Zhu Hao","year":"2020","unstructured":"Hao Zhu , Mandi Luo , Rui Wang , Aihua Zheng , and Ran He. 2020. Deep audio-visual learning: A survey. arXiv preprint arXiv:2001.04758 ( 2020 ). Hao Zhu, Mandi Luo, Rui Wang, Aihua Zheng, and Ran He. 2020. Deep audio-visual learning: A survey. arXiv preprint arXiv:2001.04758 (2020)."}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '21","name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China"},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475198","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475198","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:47Z","timestamp":1750193327000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475198"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":58,"alternative-id":["10.1145\/3474085.3475198","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475198","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}