{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:50:30Z","timestamp":1767340230358,"version":"3.32.0"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,7]]},"DOI":"10.1109\/iscslp63861.2024.10800697","type":"proceedings-article","created":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T19:11:17Z","timestamp":1734981077000},"page":"616-620","source":"Crossref","is-referenced-by-count":2,"title":["The NPU-HWC System for the ISCSLP 2024 Inspirational and Convincing Audio Generation Challenge"],"prefix":"10.1109","author":[{"given":"Dake","family":"Guo","sequence":"first","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jixun","family":"Yao","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinfa","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kangxiang","family":"Xia","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhao","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ziyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yao","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Cloud"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Liu","sequence":"additional","affiliation":[{"name":"Huawei Cloud"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lei","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU),Xi&#x0027;an"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref2","first-page":"3165","article-title":"Fastspeech: Fast, robust and controllable text to speech","volume-title":"Proc. NeurIPS.","author":"Ren","year":"2019"},{"key":"ref3","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc","author":"Kim"},{"key":"ref4","article-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","author":"Shen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"Better speech synthesis through scaling","volume-title":"arXiv preprint","author":"Betker","year":"2023"},{"key":"ref6","doi-asserted-by":"crossref","DOI":"10.1109\/ISCSLP63861.2024.10800374","article-title":"Inspirational and Convincing Audio Generation Challenge 2024 ICAGC 2024","volume-title":"Proc. ISCSLP","author":"Fu","year":"2024"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1559"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095105"},{"key":"ref9","first-page":"2709","article-title":"Yourtts: Towards zero-shot multi-speaker tts and zero-shot voice conversion for everyone","volume-title":"International Conference on Machine Learning","author":"Casanova","year":"2022"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1774"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682535"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10054"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746049"},{"key":"ref14","article-title":"Vec-tok speech: speech vectorization and tokenization for neural speech generation","volume":"abs\/2310.07246","author":"Zhu","year":"2023","journal-title":"CoRR"},{"key":"ref15","article-title":"Mega-tts 2: Zero-shot text-to-speech with arbitrary length speech prompts","author":"Jiang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref16","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"ref18","first-page":"1","article-title":"Flamingo: a visual language model for few-shot learning","author":"Alayrac","year":"2022","journal-title":"Proc NeurIPS"},{"key":"ref19","article-title":"BASE TTS: lessons from building a billion-parameter text-to-speech model on 100k hours of data","author":"Lajszczak","year":"2024","journal-title":"arXiv preprint"},{"key":"ref20","article-title":"Seed-TTS: A family of high-quality versatile speech generation models","author":"Anastassiou","year":"2024","journal-title":"arXiv preprint"},{"key":"ref21","article-title":"Audiogen: Tex-tually guided audio generation","volume-title":"Proc. ICLR","author":"Kreuk","year":"2023"},{"key":"ref22","first-page":"21450","article-title":"AudioLDM: Text-to-audio generation with latent diffusion models","volume-title":"Proc. ICML","volume":"202","author":"Liu","year":"2023"},{"key":"ref23","first-page":"13916","article-title":"Make-An-Audio: Text-To-Audio Gener-ation with Prompt-Enhanced Diffusion Models","volume-title":"Proc. ICML","author":"Huang","year":"2023"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681688"},{"key":"ref25","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume-title":"Proc. NeurIPS","author":"Rafailov","year":"2023"},{"key":"ref26","article-title":"Scaling instruction-finetuned language models","author":"Chung","year":"2022","journal-title":"arXiv preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2343"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414423"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-9996"}],"event":{"name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","start":{"date-parts":[[2024,11,7]]},"location":"Beijing, China","end":{"date-parts":[[2024,11,10]]}},"container-title":["2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10799944\/10799969\/10800697.pdf?arnumber=10800697","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,24]],"date-time":"2024-12-24T06:26:39Z","timestamp":1735021599000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10800697\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,7]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/iscslp63861.2024.10800697","relation":{},"subject":[],"published":{"date-parts":[[2024,11,7]]}}}