{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,2]],"date-time":"2025-08-02T17:35:53Z","timestamp":1754156153480,"version":"3.41.2"},"reference-count":33,"publisher":"Emerald","issue":"2","license":[{"start":{"date-parts":[[2024,1,26]],"date-time":"2024-01-26T00:00:00Z","timestamp":1706227200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.emerald.com\/insight\/site-policies"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IJWIS"],"published-print":{"date-parts":[[2024,2,23]]},"abstract":"<jats:sec>\n<jats:title content-type=\"abstract-subheading\">Purpose<\/jats:title>\n<jats:p>This paper aims to disentangle Chinese-English-rich resources linguistic and speaker timbre features, achieving cross-lingual speaker transfer for Cambodian.<\/jats:p>\n<\/jats:sec>\n<jats:sec>\n<jats:title content-type=\"abstract-subheading\">Design\/methodology\/approach<\/jats:title>\n<jats:p>This study introduces a novel approach: the construction of a cross-lingual feature disentangler coupled with the integration of time-frequency attention adaptive normalization to proficiently convert Cambodian speaker timbre into Chinese-English without altering the underlying Cambodian speech content.<\/jats:p>\n<\/jats:sec>\n<jats:sec>\n<jats:title content-type=\"abstract-subheading\">Findings<\/jats:title>\n<jats:p>Considering the limited availability of multi-speaker corpora in Cambodia, conventional methods have demonstrated subpar performance in Cambodian speaker voice transfer.<\/jats:p>\n<\/jats:sec>\n<jats:sec>\n<jats:title content-type=\"abstract-subheading\">Originality\/value<\/jats:title>\n<jats:p>The originality of this study lies in the effectiveness of the disentanglement process and precise control over speaker timbre feature transfer.<\/jats:p>\n<\/jats:sec>","DOI":"10.1108\/ijwis-09-2023-0162","type":"journal-article","created":{"date-parts":[[2024,1,24]],"date-time":"2024-01-24T06:24:21Z","timestamp":1706077461000},"page":"113-128","source":"Crossref","is-referenced-by-count":0,"title":["Cross-lingual speaker transfer for Cambodian based on feature disentangler and time-frequency attention adaptive normalization"],"prefix":"10.1108","volume":"20","author":[{"given":"Yuanzhang","family":"Yang","sequence":"first","affiliation":[]},{"given":"Linqin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shengxiang","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Zhengtao","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Ling","family":"Dong","sequence":"additional","affiliation":[]}],"member":"140","published-online":{"date-parts":[[2024,1,26]]},"reference":[{"key":"key2024022204045954100_ref001","first-page":"2709","article-title":"Yourtts: towards zero-shot multi-speaker tts and zero-shot voice conversion for everyone[C]","volume-title":"International conference on machine learning","year":"2022"},{"key":"key2024022204045954100_ref002","first-page":"7907","article-title":"Fine-grained style control in transformer-based text-to-speech synthesis[C]\/\/ICASSP","volume-title":"2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","year":"2022"},{"key":"key2024022204045954100_ref003","first-page":"3893","article-title":"Voice conversion using artificial neural networks [C]\/\/2009","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing","year":"2009"},{"article-title":"On 
the impact of alignment on voice conversion performance[C]","volume-title":"Ninth annual conference of the international speech communication association","year":"2008","key":"key2024022204045954100_ref004"},{"issue":"1","key":"key2024022204045954100_ref005","doi-asserted-by":"crossref","first-page":"149","DOI":"10.1017\/S0041977X00084251","article-title":"The main features of Cambodian pronunciation[J]","volume":"14","year":"1952","journal-title":"Bulletin of the School of Oriental and African Studies"},{"article-title":"Voice conversion from unaligned corpora using variational autoencoding wasserstein generative adversarial networks[J]","volume-title":"arXiv preprint arXiv:1704.00849","year":"2017","key":"key2024022204045954100_ref006"},{"article-title":"Hierarchical generative modeling for controllable speech synthesis[J]","volume-title":"arXiv preprint arXiv:1810.07217","year":"2018","key":"key2024022204045954100_ref007"},{"key":"key2024022204045954100_ref008","first-page":"2525","article-title":"Singgan: Generative adversarial network for high-fidelity singing voice generation","volume-title":"C]\/\/Proceedings of the 30th ACM International Conference on Multimedia","year":"2022"},{"article-title":"Transpeech: Speech-to-speech translation with bilateral perturbation[J]","volume-title":"arXiv preprint arXiv:2205.12523","year":"2022","key":"key2024022204045954100_ref009"},{"first-page":"6820","article-title":"Cyclegan-vc2: Improved cyclegan-based non-parallel voice conversion[C]\/\/ICASSP","year":"2019","key":"key2024022204045954100_ref010"},{"article-title":"Cyclegan-vc3: Examining and improving cyclegan-vcs for mel-spectrogram conversion[J]","volume-title":"arXiv preprint arXiv:2010.11672","year":"2020","key":"key2024022204045954100_ref011"},{"first-page":"5919","article-title":"Maskcyclegan-vc: Learning non-parallel voice conversion with filling in frames[C]\/\/ICASSP","year":"2021","key":"key2024022204045954100_ref012"},{"article-title":"Parallel-data-free voice conversion using cycle-consistent adversarial networks[J]","volume-title":"arXiv preprint arXiv:1711.11293","year":"2017","key":"key2024022204045954100_ref013"},{"key":"key2024022204045954100_ref014","first-page":"32","article-title":"Melgan: Generative adversarial networks for conditional waveform synthesis[J]","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"key2024022204045954100_ref015","doi-asserted-by":"crossref","first-page":"1173","DOI":"10.1109\/TASLP.2022.3156757","article-title":"Duration controllable voice conversion via phoneme-based information bottleneck[J]","volume":"30","year":"2022","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"key2024022204045954100_ref016","first-page":"11020","article-title":"Diffsinger: Singing voice synthesis via shallow diffusion mechanism","volume-title":"[C]\/\/Proceedings of the AAAI conference on artificial intelligence","year":"2022"},{"key":"key2024022204045954100_ref017","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1163\/9789004473263_005","article-title":"Language contact and lexical changes in Khmer and Teochew in Cambodia and beyond[M]\/\/Sinophone southeast Asia","year":"2021","journal-title":"Brill"},{"article-title":"One model, many languages: Meta-learning for multilingual text-to-speech[J]","volume-title":"arXiv preprint arXiv:2008.00768","year":"2020","key":"key2024022204045954100_ref018"},{"first-page":"7012","article-title":"Nvc-net: End-to-end adversarial voice 
conversion[C]\/\/ICASSP","year":"2022","key":"key2024022204045954100_ref019"},{"first-page":"6284","article-title":"F0-consistent many-to-many non-parallel voice conversion via conditional autoencoder[C]\/\/ICASSP","year":"2020","key":"key2024022204045954100_ref021"},{"first-page":"6797","article-title":"Cross-speaker style transfer for text-to-speech using data augmentation[C]\/\/ICASSP","year":"2022","key":"key2024022204045954100_ref022"},{"issue":"2","key":"key2024022204045954100_ref023","doi-asserted-by":"crossref","first-page":"131","DOI":"10.1109\/89.661472","article-title":"Continuous probabilistic transform for voice conversion[J]","volume":"6","year":"1998","journal-title":"IEEE Transactions on Speech and Audio Processing"},{"first-page":"556","article-title":"VTLN-based voice conversion[C]\/\/proceedings of the 3rd","year":"2003","key":"key2024022204045954100_ref024"},{"article-title":"Cross-speaker emotion transfer for low-resource text-to-speech using non-parallel voice conversion with pitch-shift data augmentation[J]","volume-title":"arXiv preprint arXiv:2204.10020","year":"2022","key":"key2024022204045954100_ref025"},{"issue":"8","key":"key2024022204045954100_ref026","doi-asserted-by":"crossref","first-page":"2222","DOI":"10.1109\/TASL.2007.907344","article-title":"Voice conversion based on maximum-likelihood estimation of spectral parameter trajectory[J]","volume":"15","year":"2007","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"issue":"8","key":"key2024022204045954100_ref027","doi-asserted-by":"crossref","first-page":"2222","DOI":"10.1109\/TASL.2007.907344","article-title":"Voice conversion based on maximum-likelihood estimation of spectral parameter trajectory[J]","volume":"15","year":"2007","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"article-title":"Instance normalization: the missing ingredient for fast stylization[J]","volume-title":"arXiv preprint arXiv:1607.08022","year":"2016","key":"key2024022204045954100_ref028"},{"key":"key2024022204045954100_ref029","first-page":"1","article-title":"Zero-shot voice conversion via self-supervised prosody representation learning[C]","volume-title":"2022 International Joint Conference on Neural Networks (IJCNN)","year":"2022"},{"article-title":"Speech representation disentanglement with adversarial mutual information learning for one-shot voice conversion[J]","volume-title":"arXiv preprint arXiv:2208.08757","year":"2022","key":"key2024022204045954100_ref030"},{"issue":"3","key":"key2024022204045954100_ref031","doi-asserted-by":"crossref","first-page":"631","DOI":"10.1109\/TASLP.2019.2892235","article-title":"Sequence-to-sequence acoustic modeling for voice conversion[J]","volume":"27","year":"2019","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"article-title":"Learning to speak fluently in a foreign language: Multilingual speech synthesis and cross-language voice cloning[J]","volume-title":"arXiv preprint arXiv:1907.04448","year":"2019","key":"key2024022204045954100_ref032"},{"article-title":"Content-dependent fine-grained speaker embedding for zero-shot speaker adaptation in text-to-speech synthesis[J]","volume-title":"arXiv preprint arXiv:2204.00990","year":"2022","key":"key2024022204045954100_ref033"},{"first-page":"2223","article-title":"Unpaired image-to-image translation using cycle-consistent adversarial networks[C]","year":"2017","key":"key2024022204045954100_ref034"}],"container-title":["International Journal of Web 
Information Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.emerald.com\/insight\/content\/doi\/10.1108\/IJWIS-09-2023-0162\/full\/xml","content-type":"application\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.emerald.com\/insight\/content\/doi\/10.1108\/IJWIS-09-2023-0162\/full\/html","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,24]],"date-time":"2025-07-24T22:24:17Z","timestamp":1753395857000},"score":1,"resource":{"primary":{"URL":"http:\/\/www.emerald.com\/ijwis\/article\/20\/2\/113-128\/1215305"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1,26]]},"references-count":33,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2024,1,26]]},"published-print":{"date-parts":[[2024,2,23]]}},"alternative-id":["10.1108\/IJWIS-09-2023-0162"],"URL":"https:\/\/doi.org\/10.1108\/ijwis-09-2023-0162","relation":{},"ISSN":["1744-0084","1744-0084"],"issn-type":[{"type":"print","value":"1744-0084"},{"type":"electronic","value":"1744-0084"}],"subject":[],"published":{"date-parts":[[2024,1,26]]}}}