{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,25]],"date-time":"2024-12-25T05:13:47Z","timestamp":1735103627915,"version":"3.32.0"},"reference-count":22,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T00:00:00Z","timestamp":1730937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,11,7]]},"DOI":"10.1109\/iscslp63861.2024.10800334","type":"proceedings-article","created":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T19:11:17Z","timestamp":1734981077000},"page":"621-625","source":"Crossref","is-referenced-by-count":0,"title":["Vivid Background Audio Generation Based on Large Language Models and AudioLDM"],"prefix":"10.1109","author":[{"given":"Yiwei","family":"Liang","sequence":"first","affiliation":[{"name":"Duke Kunshan University,Suzhou Municipal Key Laboratory of Multimodal Intelligent Systems, Data Science Research Center,Kunshan"}]},{"given":"Ming","family":"Li","sequence":"additional","affiliation":[{"name":"Duke Kunshan University,Suzhou Municipal Key Laboratory of Multimodal Intelligent Systems, Data Science Research Center,Kunshan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095623"},{"key":"ref2","first-page":"19594","article-title":"Styletts 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li","year":"2023"},{"volume-title":"Speecht5: Unified-modal 
encoder-decoder pretraining for spoken language processing","year":"2022","author":"Ao","key":"ref3"},{"volume-title":"Neural codec language models are zero-shot text to speech synthesizers","year":"2023","author":"Wang","key":"ref4"},{"key":"ref5","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR52688.2022.01042","volume-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach","year":"2022"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3592458"},{"volume-title":"Text-driven foley sound generation with latent diffusion model","year":"2023","author":"Yuan","key":"ref7"},{"key":"ref8","first-page":"21450","article-title":"AudioLDM: Text-to-audio generation with latent diffusion models","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Liu","year":"2023"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"ref10","first-page":"56998","article-title":"Latent diffusion for language generation","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Lovelace","year":"2023"},{"volume-title":"Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers","year":"2023","author":"Shen","key":"ref11"},{"volume-title":"A comprehensive overview of large language models","year":"2024","author":"Naveed","key":"ref12"},{"volume-title":"Text-to-audio generation using instruction-tuned llm and latent diffusion model","year":"2023","author":"Ghosal","key":"ref13"},{"volume-title":"Tango 2: Aligning diffusion-based text-to-audio generations through direct preference optimization","year":"2024","author":"Majumder","key":"ref14"},{"issue":"70","key":"ref15","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung","year":"2024","journal-title":"Journal of Machine Learning Research"},{"volume-title":"Icagc 2024: Inspirational and convincing 
audio generation challenge 2024","year":"2024","author":"Fu","key":"ref16"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/aimv53313.2021.9670959"},{"key":"ref18","first-page":"119","article-title":"AudioCaps: Generating captions for audios in the wild","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3605943"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"volume-title":"Masked autoencoders that listen","year":"2023","author":"Huang","key":"ref21"},{"volume-title":"Decoupling magnitude and phase estimation with deep resunet for music source separation","year":"2021","author":"Kong","key":"ref22"}],"event":{"name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","start":{"date-parts":[[2024,11,7]]},"location":"Beijing, China","end":{"date-parts":[[2024,11,10]]}},"container-title":["2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10799944\/10799969\/10800334.pdf?arnumber=10800334","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,24]],"date-time":"2024-12-24T06:26:23Z","timestamp":1735021583000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10800334\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,7]]},"references-count":22,"URL":"https:\/\/doi.org\/10.1109\/iscslp63861.2024.10800334","relation":{},"subject":[],"published":{"date-parts":[[2024,11,7]]}}}