{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:34:40Z","timestamp":1776882880162,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681539","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"4446-4454","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Convert and Speak: Zero-shot Accent Conversion with Minimum Supervision"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-8842-2088","authenticated-orcid":false,"given":"Zhijun","family":"Jia","sequence":"first","affiliation":[{"name":"Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7110-5574","authenticated-orcid":false,"given":"Huaying","family":"Xue","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8213-4878","authenticated-orcid":false,"given":"Xiulian","family":"Peng","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5383-6424","authenticated-orcid":false,"given":"Yan","family":"Lu","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449--12460."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_2_4_1","volume-title":"High Fidelity Neural Audio Compression. CoRR","author":"D\u00e9fossez Alexandre","year":"2022","unstructured":"Alexandre D\u00e9fossez, Jade Copet, Gabriel Synnaeve, and Yossi Adi. 2022. High Fidelity Neural Audio Compression. CoRR, Vol. abs\/2210.13438 (2022)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2021.101302"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_2_7_1","volume-title":"Transpeech: Speech-to-speech translation with bilateral perturbation. arXiv preprint arXiv:2205.12523","author":"Huang Rongjie","year":"2022","unstructured":"Rongjie Huang, Jinglin Liu, Huadai Liu, Yi Ren, Lichao Zhang, Jinzheng He, and Zhou Zhao. 2022. Transpeech: Speech-to-speech translation with bilateral perturbation. arXiv preprint arXiv:2205.12523 (2022)."},{"key":"e_1_3_2_2_8_1","volume-title":"The new accent technologies: recognition, measurement and manipulation of accented speech","author":"Huckvale Mark","unstructured":"Mark Huckvale. 2006. The new accent technologies: recognition, measurement and manipulation of accented speech. Beijing: Language and Culture Press."},{"key":"e_1_3_2_2_9_1","volume-title":"Non-parallel Accent Conversion using Pseudo Siamese Disentanglement Network. arXiv preprint arXiv:2212.05751","author":"Jia Dongya","year":"2022","unstructured":"Dongya Jia, Qiao Tian, Jiaxin Li, Yuanzhe Chen, Kainan Peng, Mingbo Ma, Yuping Wang, and Yuxuan Wang. 2022. Non-parallel Accent Conversion using Pseudo Siamese Disentanglement Network. arXiv preprint arXiv:2212.05751 (2022)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3277693"},{"key":"e_1_3_2_2_11_1","volume-title":"Black","author":"Kominek John","year":"2004","unstructured":"John Kominek and Alan W. Black. 2004. The Carnegie Mellon University Arctic speech databases. In SSW. ISCA, 223--224."},{"key":"e_1_3_2_2_12_1","volume-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)."},{"key":"e_1_3_2_2_13_1","volume-title":"Improving accent conversion with reference encoder and end-to-end text-to-speech. arXiv preprint arXiv:2005.09271","author":"Li Wenjie","year":"2020","unstructured":"Wenjie Li, Benlai Tang, Xiang Yin, Yushi Zhao, Wei Li, Kang Wang, Hao Huang, Yuxuan Wang, and Zejun Ma. 2020. Improving accent conversion with reference encoder and end-to-end text-to-speech. arXiv preprint arXiv:2005.09271 (2020)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413699"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053797"},{"key":"e_1_3_2_2_16_1","volume-title":"Nisqa: A deep cnn-self-attention model for multidimensional speech quality prediction with crowdsourced datasets. arXiv preprint arXiv:2104.09494","author":"Mittag Gabriel","year":"2021","unstructured":"Gabriel Mittag, Babak Naderi, Assmaa Chehadi, and Sebastian M\u00f6ller. 2021. Nisqa: A deep cnn-self-attention model for multidimensional speech quality prediction with crowdsourced datasets. arXiv preprint arXiv:2104.09494 (2021)."},{"key":"e_1_3_2_2_17_1","first-page":"2583","article-title":"Accent Conversion using Pre-trained Model and Synthesized Data from Voice Conversion","volume":"2022","author":"Nguyen Tuan Nam","year":"2022","unstructured":"Tuan Nam Nguyen, Ngoc-Quan Pham, and Alexander Waibel. 2022. Accent Conversion using Pre-trained Model and Synthesized Data from Voice Conversion. In Proc. Interspeech, Vol. 2022. 2583--2587.","journal-title":"Proc. Interspeech"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_2_19_1","volume-title":"Speech resynthesis from discrete disentangled self-supervised representations. arXiv preprint arXiv:2104.00355","author":"Polyak Adam","year":"2021","unstructured":"Adam Polyak, Yossi Adi, Jade Copet, Eugene Kharitonov, Kushal Lakhotia, Wei-Ning Hsu, Abdelrahman Mohamed, and Emmanuel Dupoux. 2021. Speech resynthesis from discrete disentangled self-supervised representations. arXiv preprint arXiv:2104.00355 (2021)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_2_22_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_23_1","unstructured":"Chengyi Wang Sanyuan Chen Yu Wu Ziqiang Zhang Long Zhou Shujie Liu Zhuo Chen Yanqing Liu Huaming Wang Jinyu Li et al. 2023. Neural codec language models are zero-shot text to speech synthesizers. arXiv preprint arXiv:2301.02111 (2023)."},{"key":"e_1_3_2_2_24_1","volume-title":"Xiao Chen, Xunying Liu, and Helen Meng.","author":"Wang Disong","year":"2021","unstructured":"Disong Wang, Liqun Deng, Yu Ting Yeung, Xiao Chen, Xunying Liu, and Helen Meng. 2021. Vqmivc: Vector quantization and mutual information-based unsupervised speech representation disentanglement for one-shot voice conversion. arXiv preprint arXiv:2106.10132 (2021)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","unstructured":"Junichi Yamagishi Christophe Veaux and Kirsten MacDonald. 2019. CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit (version 0.92). https:\/\/doi.org\/10.7488\/ds\/2645","DOI":"10.7488\/ds"},{"key":"e_1_3_2_2_26_1","volume-title":"Zipformer: A faster and better encoder for automatic speech recognition. CoRR","author":"Yao Zengwei","year":"2023","unstructured":"Zengwei Yao, Liyong Guo, Xiaoyu Yang, Wei Kang, Fangjun Kuang, Yifan Yang, Zengrui Jin, Long Lin, and Daniel Povey. 2023. Zipformer: A faster and better encoder for automatic speech recognition. CoRR, Vol. abs\/2310.11230 (2023)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Heiga Zen Viet Dang Rob Clark Yu Zhang Ron J. Weiss Ye Jia Zhifeng Chen and Yonghui Wu. 2019. LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech. In INTERSPEECH. ISCA 1526--1530.","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Guanlong Zhao Shaojin Ding and Ricardo Gutierrez-Osuna. 2019. Foreign Accent Conversion by Synthesizing Speech from Phonetic Posteriorgrams.. In Interspeech. 2843--2847.","DOI":"10.21437\/Interspeech.2019-1778"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3060813"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462258"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Guanlong Zhao Sinem Sonsaat Alif Silpachai Ivana Lucic Evgeny Chukharev-Hudilainen John Levis and Ricardo Gutierrez-Osuna. 2018. L2-ARCTIC: A Non-native English Speech Corpus. In INTERSPEECH. ISCA 2783--2787.","DOI":"10.21437\/Interspeech.2018-1110"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3270079"},{"key":"e_1_3_2_2_34_1","volume-title":"CommonAccent: Exploring Large Acoustic Pretrained Models for Accent Classification Based on Common Voice. Interspeech 2023","author":"Zuluaga-Gomez Juan","year":"2023","unstructured":"Juan Zuluaga-Gomez, Sara Ahmed, Danielius Visockas, and Cem Subakan. 2023. CommonAccent: Exploring Large Acoustic Pretrained Models for Accent Classification Based on Common Voice. Interspeech 2023 (2023). https:\/\/arxiv.org\/abs\/2305.18283"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681539","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681539","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681539"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":34,"alternative-id":["10.1145\/3664647.3681539","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681539","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}