{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T04:00:19Z","timestamp":1771560019355,"version":"3.50.1"},"reference-count":121,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T00:00:00Z","timestamp":1771545600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T00:00:00Z","timestamp":1771545600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Front. Comput. Sci."],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1007\/s11704-025-51171-9","type":"journal-article","created":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T01:51:18Z","timestamp":1771552278000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Next-Gen AIGC: a review of multimodal foundation models for text-to-media innovations"],"prefix":"10.1007","volume":"20","author":[{"given":"Cong","family":"Jin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingru","family":"Fan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinfa","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinyuan","family":"Fu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tao","family":"Mei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,20]]},"reference":[{"key":"51171_CR1","unstructured":"Bommasani R, Hudson D A, Adeli E, Altman R, Arora S, et al. On the opportunities and risks of foundation models. 2021, arXiv preprint arXiv: 2108.07258"},{"key":"51171_CR2","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"A Radford","year":"2021","unstructured":"Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I. Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning. 2021, 8748\u20138763"},{"key":"51171_CR3","first-page":"8821","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"A Ramesh","year":"2021","unstructured":"Ramesh A, Pavlov M, Goh G, Gray S, Voss C, Radford A, Chen M, Sutskever I. Zero-shot text-to-image generation. In: Proceedings of the 38th International Conference on Machine Learning. 2021, 8821\u20138831"},{"key":"51171_CR4","first-page":"1597","volume-title":"Proceedings of the 37th International Conference on Machine Learning","author":"T Chen","year":"2020","unstructured":"Chen T, Kornblith S, Norouzi M, Hinton G E. A simple framework for contrastive learning of visual representations. In: Proceedings of the 37th International Conference on Machine Learning. 2020, 1597\u20131607"},{"key":"51171_CR5","first-page":"9726","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"K He","year":"2020","unstructured":"He K, Fan H, Wu Y, Xie S, Girshick R. Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2020, 9726\u20139735"},{"key":"51171_CR6","volume-title":"Proceedings of the 10th International Conference on Learning Representations","author":"H Bao","year":"2022","unstructured":"Bao H, Dong L, Piao S, Wei F. BEiT: BERT pre-training of image transformers. In: Proceedings of the 10th International Conference on Learning Representations. 2022"},{"key":"51171_CR7","first-page":"15979","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"K He","year":"2022","unstructured":"He K, Chen X, Xie S, Li Y, Doll\u00e1r P, Girshick R. Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2022, 15979\u201315988"},{"key":"51171_CR8","unstructured":"Achiam J, Adler S, Agarwal S, Ahmad L, Akkaya I, et al. GPT-4 technical report. 2023, arXiv preprint arXiv: 2303.08774"},{"key":"51171_CR9","unstructured":"Anil R, Borgeaud S, Alayrac J B, Yu J H, Soricut R, et al. Gemini: a family of highly capable multimodal models. 2023, arXiv preprint arXiv: 2312.11805"},{"key":"51171_CR10","unstructured":"Jin Y, Li J, Liu Y, Gu T, Wu K, Jiang Z, He M, Zhao B, Tan X, Gan Z, Wang Y, Wang C, Ma L. Efficient multimodal large language models: a survey. 2024, arXiv preprint arXiv: 2405.10739"},{"key":"51171_CR11","first-page":"12401","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"D Zhang","year":"2024","unstructured":"Zhang D, Yu Y, Dong J, Li C, Su D, Chu C, Yu D. MM-LLMs: recent advances in multimodal large language models. In: Proceedings of Findings of the Association for Computational Linguistics. 2024, 12401\u201312430"},{"key":"51171_CR12","unstructured":"Bai G, Chai Z, Ling C, Wang S, Lu J, Zhang N, Shi T, Yu Z, Zhu M, Zhang Y, Song X, Yang C, Cheng Y, Zhao L. Beyond efficiency: a systematic survey of resource-efficient large language models. 2024, arXiv preprint arXiv: 2401.00625"},{"key":"51171_CR13","first-page":"1419","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"B Sorscher","year":"2022","unstructured":"Sorscher B, Geirhos R, Shekhar S, Ganguli S, Morcos A. Beyond neural scaling laws: beating power law scaling via data pruning. In: Proceedings of the 36th International Conference on Neural Information Processing Systems. 2022, 1419"},{"key":"51171_CR14","unstructured":"Bai T, Liang H, Wan B, Xu Y, Li X, Li S, Yang L, Li B, Wang Y, Cui B, Huang P, Shan J, He C, Yuan B, Zhang W. A survey of multimodal large language model from a data-centric perspective. 2024, arXiv preprint arXiv: 2405.16640"},{"issue":"9","key":"51171_CR15","doi-asserted-by":"publisher","first-page":"5005","DOI":"10.1109\/TFUZZ.2024.3409146","volume":"32","author":"C Jin","year":"2024","unstructured":"Jin C, Liu X, Zhao Y, Zhu Y, Wang J, Wang H. ViolinBot: A framework for imitation learning of violin bowing using fuzzy logic and PCA. IEEE Transactions on Fuzzy Systems, 2024, 32(9): 5005\u20135017","journal-title":"IEEE Transactions on Fuzzy Systems"},{"key":"51171_CR16","first-page":"12888","volume-title":"Proceedings of the 39th International Conference on Machine Learning","author":"J Li","year":"2022","unstructured":"Li J, Li D, Xiong C, Hoi S C H. BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Proceedings of the 39th International Conference on Machine Learning. 2022, 12888\u201312900"},{"key":"51171_CR17","first-page":"19358","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Fang","year":"2023","unstructured":"Fang Y, Wang W, Xie B, Sun Q, Wu L, Wang X, Huang T, Wang X, Cao Y. EVA: exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2023, 19358\u201319369"},{"issue":"1","key":"51171_CR18","doi-asserted-by":"publisher","first-page":"6","DOI":"10.1631\/FITEE.2300089","volume":"25","author":"J Zhou","year":"2024","unstructured":"Zhou J, Ke P, Qiu X, Huang M, Zhang J. ChatGPT: potential, prospects, and limitations. Frontiers of Information Technology & Electronic Engineering, 2024, 25(1): 6\u201311.","journal-title":"Frontiers of Information Technology & Electronic Engineering"},{"key":"51171_CR19","volume-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90%* chatGPT quality","author":"W L Chiang","year":"2023","unstructured":"Chiang W L, Li Z, Lin Z, Sheng Y, Wu Z, Zhang H, Zheng L, Zhuang S, Zhuang Y, Gonzalez J E, Stoica I, Xing E P. Vicuna: An open-source chatbot impressing GPT-4 with 90%* chatGPT quality. See vicuna.lmsys website, 2023"},{"key":"51171_CR20","unstructured":"Touvron H, Lavril T, Izacard G, Martinet X, Lachaux M A, Lacroix T, Rozi\u00e8re B, Goyal N, Hambro E, Azhar F, Rodriguez A, Joulin A, Grave E, Lample G. LLaMA: open and efficient foundation language models. 2023, arXiv preprint arXiv: 2302.13971"},{"key":"51171_CR21","first-page":"1723","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"J B Alayrac","year":"2022","unstructured":"Alayrac J B, Donahue J, Luc P, Miech A, Barr I, Hasson Y, Lenc K, Mensch A, Millican K, Reynolds M, Ring R, Rutherford E, Cabi S, Han T, Gong Z, Samangooei S, Monteiro M, Menick J, Borgeaud S, Brock A, Nematzadeh A, Sharifzadeh S, Binkowski M, Barreira R, Vinyals O, Zisserman A, Simonyan K. Flamingo: a visual language model for few-shot learning. In: Proceedings of the 36th International Conference on Neural Information Processing Systems. 2022, 1723"},{"key":"51171_CR22","first-page":"814","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"J Li","year":"2023","unstructured":"Li J, Li D, Savarese S, Hoi S. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the 40th International Conference on Machine Learning. 2023, 814"},{"key":"51171_CR23","unstructured":"Peng Z, Wang W, Dong L, Hao Y, Huang S, Ma S, Wei F. Kosmos-2: grounding multimodal large language models to the world. 2023, arXiv preprint arXiv: 2306.14824"},{"key":"51171_CR24","unstructured":"Zheng K Z, He X H, Wang X E. MiniGPT-5: interleaved vision-and-language generation via generative vokens. 2023, arXiv preprint arXiv: 2310.02239"},{"key":"51171_CR25","first-page":"15757","volume-title":"Proceedings of the Findings of the Association for Computational Linguistics","author":"D Zhang","year":"2023","unstructured":"Zhang D, Li S, Zhang X, Zhan J, Wang P, Zhou Y, Qiu X. SpeechGPT: empowering large language models with intrinsic cross-modal conversational abilities. In: Proceedings of the Findings of the Association for Computational Linguistics. 2023, 15757\u201315773"},{"key":"51171_CR26","unstructured":"Wu C, Yin S, Qi W, Wang X, Tang Z, Duan N. Visual chatGPT: talking, drawing and editing with visual foundation models. 2023, arXiv preprint arXiv: 2303.04671"},{"key":"51171_CR27","first-page":"23802","volume-title":"Proceedings of the 38th AAAI Conference on Artificial Intelligence","author":"R Huang","year":"2024","unstructured":"Huang R, Li M, Yang D, Shi J, Chang X, Ye Z, Wu Y, Hong Z, Huang J, Liu J, Ren Y, Zou Y, Zhao Z, Watanabe S. AudioGPT: understanding and generating speech, music, sound, and talking head. In: Proceedings of the 38th AAAI Conference on Artificial Intelligence. 2024, 23802\u201323804"},{"issue":"10","key":"51171_CR28","doi-asserted-by":"publisher","first-page":"026","DOI":"10.1088\/1475-7516\/2012\/10\/026","volume":"2012","author":"Y Farzan","year":"2012","unstructured":"Farzan Y, RezaeiAkbarieh A. VDM: a model for vector dark matter. Journal of Cosmology and Astroparticle Physics, 2012, 2012(10): 026","journal-title":"Journal of Cosmology and Astroparticle Physics"},{"key":"51171_CR29","unstructured":"Zhou D, Wang W, Yan H, Lv W, Zhu Y, Feng J. MagicVideo: efficient video generation with latent diffusion models. 2022, arXiv preprint arXiv: 2211.11018"},{"key":"51171_CR30","first-page":"15908","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"L Khachatryan","year":"2023","unstructured":"Khachatryan L, Movsisyan A, Tadevosyan V, Henschel R, Wang Z, Navasardyan S, Shi H. Text2Video-zero: text-to-image diffusion models are zero-shot video generators. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2023, 15908\u201315918"},{"issue":"1","key":"51171_CR31","first-page":"99","volume":"16","author":"Y Cui","year":"2024","unstructured":"Cui Y, Shan X, Chung J. A feasibility study on RUNWAY GEN-2 for generating realistic style images. International Journal of Internet, Broadcasting and Communication, 2024, 16(1): 99\u2013105","journal-title":"International Journal of Internet, Broadcasting and Communication"},{"key":"51171_CR32","first-page":"1309","volume-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics","author":"S Yin","year":"2023","unstructured":"Yin S, Wu C, Yang H, Wang J, Wang X, Ni M, Yang Z, Li L, Liu S, Yang F, Fu J, Gong M, Wang L, Liu Z, Li H, Duan N. NUWA-XL: diffusion over diffusion for extremely long video generation. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics. 2023, 1309\u20131320"},{"key":"51171_CR33","first-page":"722","volume-title":"Proceedings of the 6th Indian Conference on Computer Vision, Graphics & Image Processing","author":"M E Nilsback","year":"2008","unstructured":"Nilsback M E, Zisserman A. Automated flower classification over a large number of classes. In: Proceedings of the 6th Indian Conference on Computer Vision, Graphics & Image Processing. 2008, 722\u2013729"},{"key":"51171_CR34","first-page":"740","volume-title":"Proceedings of the 13th European Conference Computer Vision","author":"T Y Lin","year":"2014","unstructured":"Lin T Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick C L. Microsoft COCO: common objects in context. In: Proceedings of the 13th European Conference Computer Vision. 2014, 740\u2013755"},{"key":"51171_CR35","first-page":"2256","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"W Xia","year":"2021","unstructured":"Xia W, Yang Y, Xue J H, Wu B. TediGAN: text-guided diverse face image generation and manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2021, 2256\u20132265"},{"key":"51171_CR36","first-page":"13779","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Y Jiang","year":"2021","unstructured":"Jiang Y, Huang Z, Pan X, Loy C C, Liu Z. Talk-to-edit: finegrained facial editing via dialog. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2021, 13779\u201313788"},{"key":"51171_CR37","doi-asserted-by":"publisher","first-page":"2940","DOI":"10.1145\/3474085.3481026","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia","author":"Y Zhou","year":"2021","unstructured":"Zhou Y. Generative adversarial network for text-to-face synthesis and manipulation. In: Proceedings of the 29th ACM International Conference on Multimedia. 2021, 2940\u20132944"},{"key":"51171_CR38","doi-asserted-by":"publisher","first-page":"2290","DOI":"10.1145\/3474085.3475391","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia","author":"J Sun","year":"2021","unstructured":"Sun J, Li Q, Wang W, Zhao J, Sun Z. Multi-caption text-to-face synthesis: dataset and algorithm. In: Proceedings of the 29th ACM International Conference on Multimedia. 2021, 2290\u20132298"},{"issue":"4","key":"51171_CR39","doi-asserted-by":"publisher","first-page":"162","DOI":"10.1145\/3528223.3530104","volume":"41","author":"Y Jiang","year":"2022","unstructured":"Jiang Y, Yang S, Qiu H, Wu W, Loy C C, Liu Z. Text2Human: text-driven controllable human image generation. ACM Transactions on Graphics, 2022, 41(4): 162","journal-title":"ACM Transactions on Graphics"},{"key":"51171_CR40","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1145\/3552485.3554935","volume-title":"Proceedings of the 1st International Workshop on Multimedia for Cooking, Eating, and Related APPlications","author":"Y Zhou","year":"2022","unstructured":"Zhou Y, Shimada N. ABLE: aesthetic box lunch editing. In: Proceedings of the 1st International Workshop on Multimedia for Cooking, Eating, and Related APPlications. 2022, 53\u201356"},{"key":"51171_CR41","first-page":"14805","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"J Yu","year":"2023","unstructured":"Yu J, Zhu H, Jiang L, Loy C C, Cai W, Wu W. CelebV-text: a large-scale facial text-video dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2023, 14805\u201314814"},{"key":"51171_CR42","first-page":"5804","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"L A Hendricks","year":"2017","unstructured":"Hendricks L A, Wang O, Shechtman E, Sivic J, Darrell T, Russell B. Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision. 2017, 5804\u20135813"},{"key":"51171_CR43","first-page":"1708","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"M Bain","year":"2021","unstructured":"Bain M, Nagrani A, Varol G, Zisserman A. Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2021, 1708\u20131718"},{"key":"51171_CR44","first-page":"3185","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"S Chen","year":"2023","unstructured":"Chen S, Li H, Wang Q, Zhao Z, Sun M, Zhu X, Liu J. VAST: a vision-audio-subtitle-text omni-modality foundation model and dataset. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 3185"},{"key":"51171_CR45","first-page":"13320","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"T S Chen","year":"2024","unstructured":"Chen T S, Siarohin A, Menapace W, Deyneka E, Chao H W, Jeon B E, Fang Y, Lee H Y, Ren J, Yang M H, Tulyakov S. Panda-70M: Captioning 70M videos with multiple cross-modality teachers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024, 13320\u201313331"},{"key":"51171_CR46","first-page":"510","volume-title":"Proceedings of the 14th European Conference on Computer Vision","author":"G A Sigurdsson","year":"2016","unstructured":"Sigurdsson G A, Varol G, Wang X, Farhadi A, Laptev I, Gupta A. Hollywood in homes: crowdsourcing data collection for activity understanding. In: Proceedings of the 14th European Conference on Computer Vision. 2016, 510\u2013526"},{"key":"51171_CR47","first-page":"706","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"R Krishna","year":"2017","unstructured":"Krishna R, Hata K, Ren F, Li F F, Niebles J C. Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision. 2017, 706\u2013715"},{"key":"51171_CR48","first-page":"2630","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"A Miech","year":"2019","unstructured":"Miech A, Zhukov D, Alayrac J B, Tapaswi M, Laptev I, Sivic J. HowTo100M: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2019, 2630\u20132640"},{"key":"51171_CR49","first-page":"753","volume-title":"Proceedings of the 15th European Conference on Computer Vision","author":"D Damen","year":"2018","unstructured":"Damen D, Doughty H, Farinella G M, Fidler S, Furnari A, Kazakos E, Moltisanti D, Munro J, Perrett T, Price W, Wray M. Scaling egocentric vision: the EPIC-KITCHENS dataset. In: Proceedings of the 15th European Conference on Computer Vision. 2018, 753\u2013771"},{"key":"51171_CR50","first-page":"316","volume-title":"Proceedings of the 18th International Society for Music Information Retrieval Conference","author":"M Defferrard","year":"2017","unstructured":"Defferrard M, Benzi K, Vandergheynst P, Bresson X. FMA: a dataset for music analysis. In: Proceedings of the 18th International Society for Music Information Retrieval Conference. 2017, 316\u2013323"},{"key":"51171_CR51","volume-title":"MUSDB18-HQ - an uncompressed version of MUSDB18","author":"Z Rafii","year":"2019","unstructured":"Rafii Z, Liutkus A, St\u00f6ter F R, Mimilakis S I, Bittner R. MUSDB18-HQ - an uncompressed version of MUSDB18. See sigsep.github.io\/datasets\/musdb.html#sisec-2018-evaluation-campaign website, 2019."},{"key":"51171_CR52","unstructured":"The \u2018mixing secrets\u2019 free multitrack download library. All downloads from this site are provided free of charge for educational purposes only."},{"key":"51171_CR53","first-page":"45","volume-title":"Proceedings of 2019 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","author":"E Manilow","year":"2019","unstructured":"Manilow E, Wichern G, Seetharaman P, Le Roux J. Cutting music source separation some slakh: A dataset to study the impact of training data quality and quantity. In: Proceedings of 2019 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA). 2019, 45\u201349"},{"key":"51171_CR54","volume-title":"Proceedings of the 128th Audio Engineering Society (AES) Convention","author":"M Stein","year":"2000","unstructured":"Stein M, Abe\u00dfer J, Dittmar C, Schuller G. Automatic detection of audio effects in guitar and bass recordings. In: Proceedings of the 128th Audio Engineering Society (AES) Convention. 2000"},{"key":"51171_CR55","volume-title":"Proceedings of the 12th International Society for Music Information Retrieval Conference","author":"T Bertin-Mahieux","year":"2011","unstructured":"Bertin-Mahieux T, Ellis D P W, Whitman B, Lamere P. The million song dataset. In: Proceedings of the 12th International Society for Music Information Retrieval Conference. 2011"},{"key":"51171_CR56","first-page":"222","volume-title":"Proceedings of the 24th International Conference on Digital Audio Effects (DAFx)","author":"J Turian","year":"2021","unstructured":"Turian J, Shier J, Tzanetakis G, McNally K, Henry M. One billion audio sounds from GPU-enabled modular synthesis. In: Proceedings of the 24th International Conference on Digital Audio Effects (DAFx). 2021, 222\u2013229"},{"key":"51171_CR57","first-page":"343","volume-title":"Proceedings of the 9th Sound and Music Computing Conference","author":"M Cartwright","year":"2012","unstructured":"Cartwright M, Pardo B. Building a music search database using human computation. In: Proceedings of the 9th Sound and Music Computing Conference. 2012, 343\u2013349"},{"issue":"4","key":"51171_CR58","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1089\/big.2016.0028","volume":"4","author":"M Plappert","year":"2016","unstructured":"Plappert M, Mandery C, Asfour T. The KIT motion-language dataset. Big Data, 2016, 4(4): 236\u2013252","journal-title":"Big Data"},{"key":"51171_CR59","first-page":"722","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"A R Punnakkal","year":"2021","unstructured":"Punnakkal A R, Chandrasekaran A, Athanasiou N, Quir\u00f3s-Ram\u00edrez A, Black M J. BABEL: bodies, action and behavior with English labels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2021, 722\u2013731"},{"key":"51171_CR60","first-page":"5441","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"N Mahmood","year":"2019","unstructured":"Mahmood N, Ghorbani N, Troje N F, Pons-Moll G, Black M. AMASS: archive of motion capture as surface shapes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2019, 5441\u20135450"},{"issue":"9","key":"51171_CR61","doi-asserted-by":"publisher","first-page":"3462","DOI":"10.1007\/s11263-024-02042-6","volume":"132","author":"H Liang","year":"2024","unstructured":"Liang H, Zhang W, Li W, Yu J, Xu L. InterGen: diffusion-based multi-human motion generation under complex interactions. International Journal of Computer Vision, 2024, 132(9): 3462\u20133483","journal-title":"International Journal of Computer Vision"},{"key":"51171_CR62","doi-asserted-by":"publisher","first-page":"2021","DOI":"10.1145\/3394171.3413635","volume-title":"Proceedings of the 28th ACM International Conference on Multimedia","author":"C Guo","year":"2020","unstructured":"Guo C, Zuo X, Wang S, Zou S, Sun Q, Deng A, Gong M, Cheng L. Action2Motion: conditioned generation of 3D human motions. In: Proceedings of the 28th ACM International Conference on Multimedia. 2020, 2021\u20132029"},{"key":"51171_CR63","first-page":"1010","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"A Shahroudy","year":"2016","unstructured":"Shahroudy A, Liu J, Ng T T, Wang G. NTU RGB+D: A large scale dataset for 3D human activity analysis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2016, 1010\u20131019"},{"issue":"7","key":"51171_CR64","first-page":"1325","volume":"36","author":"C Ionescu","year":"2014","unstructured":"Ionescu C, Papava D, Olaru V, Sminchisescu C. Human3. 6m: large scale datasets and predictive methods for 3D human sensing in natural environments. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2014, 36(7): 1325\u20131339","journal-title":"6m: large scale datasets and predictive methods for 3D human sensing in natural environments. IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"8","key":"51171_CR65","doi-asserted-by":"publisher","first-page":"2191","DOI":"10.1109\/TMM.2014.2360793","volume":"16","author":"H Kadu","year":"2014","unstructured":"Kadu H, Kuo C C J. Automatic human mocap data classification. IEEE Transactions on Multimedia, 2014, 16(8): 2191\u20132202","journal-title":"IEEE Transactions on Multimedia"},{"key":"51171_CR66","first-page":"2643","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"C Saharia","year":"2022","unstructured":"Saharia C, Chan W, Saxena S, Li L L, Whang J, Denton E L, Ghasemipour K, Lopes R G, Ayan B K, Salimans T, Ho J, Fleet D J, Norouzi M. Photorealistic text-to-image diffusion models with deep language understanding. In: Proceedings of the 36th International Conference on Neural Information Processing Systems. 2022, 2643"},{"key":"51171_CR67","first-page":"10674","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"R Rombach","year":"2022","unstructured":"Rombach R, Blattmann A, Lorenz D, Esser P, Ommer B. Highresolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2022, 10674\u201310685"},{"key":"51171_CR68","volume-title":"Proceedings of the 11th International Conference on Learning Representations","author":"U Singer","year":"2023","unstructured":"Singer U, Polyak A, Hayes T, Yin X, An J, Zhang S, Hu Q, Yang H, Ashual O, Gafni O, Parikh D, Gupta S, Taigman Y. Make-a-video: text-to-video generation without text-video data. In: Proceedings of the 11th International Conference on Learning Representations. 2023"},{"key":"51171_CR69","first-page":"22563","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"A Blattmann","year":"2023","unstructured":"Blattmann A, Rombach R, Ling H, Dockhorn T, Kim S W, Fidler S, Kreis K. Align your latents: High-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2023, 22563\u201322575"},{"key":"51171_CR70","unstructured":"Agostinelli A, Denk T I, Borsos Z, Engel J, Verzetti M, Caillon A, Huang Q, Jansen A, Roberts A, Tagliasacchi M, Sharifi M, Zeghidour N, Frank C. MusicLM: generating music from text. 2023, arXiv preprint arXiv: 2301.11325"},{"issue":"6","key":"51171_CR71","doi-asserted-by":"publisher","first-page":"4115","DOI":"10.1109\/TPAMI.2024.3355414","volume":"46","author":"M Zhang","year":"2024","unstructured":"Zhang M, Cai Z, Pan L, Hong F, Guo X, Yang L, Liu Z. MotionDiffuse: text-driven human motion generation with diffusion model. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2024, 46(6): 4115\u20134128","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"51171_CR72","first-page":"574","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"J Ho","year":"2020","unstructured":"Ho J, Jain A, Abbeel P. Denoising diffusion probabilistic models. In: Proceedings of the 34th International Conference on Neural Information Processing Systems. 2020, 574"},{"key":"51171_CR73","first-page":"11294","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"B Zhang","year":"2022","unstructured":"Zhang B, Gu S, Zhang B, Bao J, Chen D, Wen F, Wang Y, Guo B. StyleSwin: Transformer-based GAN for high-resolution image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2022, 11294\u201311304"},{"key":"51171_CR74","first-page":"4172","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"W Peebles","year":"2023","unstructured":"Peebles W, Xie S. Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2023, 4172\u20134182"},{"key":"51171_CR75","first-page":"8435","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"R Zhu","year":"2024","unstructured":"Zhu R, Pan Y, Li Y, Yao T, Sun Z, Mei T, Chen C W. SD-DiT: Unleashing the power of self-supervised discrimination in diffusion transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024, 8435\u20138445"},{"key":"51171_CR76","first-page":"8911","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Qian","year":"2024","unstructured":"Qian Y, Cai Q, Pan Y, Li Y, Yao T, Sun Q, Mei T. Boosting diffusion models with moving average sampling in frequency domain. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024, 8911\u20138920"},{"key":"51171_CR77","first-page":"4171","volume-title":"Proceedings of 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"J Devlin","year":"2019","unstructured":"Devlin J, Chang M W, Lee K, Toutanova K. BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 2019, 4171\u20134186"},{"key":"51171_CR78","first-page":"3616","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"I Skorokhodov","year":"2022","unstructured":"Skorokhodov I, Tulyakov S, Elhoseiny M. StyleGAN-V: a continuous video generator with the price, image quality and perks of styleGAN2. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2022, 3616\u20133626"},{"key":"51171_CR79","first-page":"1505","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"T Qiao","year":"2019","unstructured":"Qiao T, Zhang J, Xu D, Tao D. MirrorGAN: learning text-to-image generation by redescription. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2019, 1505\u20131514"},{"key":"51171_CR80","volume-title":"Proceedings of the 12th International Conference on Learning Representations","author":"X Chen","year":"2024","unstructured":"Chen X, Wang Y, Zhang L, Zhuang S, Ma X, Yu J, Wang Y, Lin D, Qiao Y, Liu Z. SEINE: short-to-long video diffusion model for generative transition and prediction. In: Proceedings of the 12th International Conference on Learning Representations. 2024"},{"key":"51171_CR81","first-page":"700","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"J Xu","year":"2023","unstructured":"Xu J, Liu X, Wu Y, Tong Y, Li Q, Ding M, Tang J, Dong Y. ImageReward: learning and evaluating human preferences for text-to-image generation. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 700"},{"key":"51171_CR82","first-page":"185","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"B Li","year":"2019","unstructured":"Li B, Qi X, Lukasiewicz T, Torr P H S. Controllable text-to-image generation. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems. 2019, 185"},{"key":"51171_CR83","first-page":"2066","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"J Copet","year":"2023","unstructured":"Copet J, Kreuk F, Gat I, Remez T, Kant D, Synnaeve G, Adi Y, D\u00e9fossez A. Simple and controllable music generation. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 2066"},{"issue":"9","key":"51171_CR84","doi-asserted-by":"publisher","first-page":"7340","DOI":"10.1109\/TPAMI.2025.3558507","volume":"47","author":"S Yuan","year":"2025","unstructured":"Yuan S, Huang J, Shi Y, Xu Y, Zhu R, Lin B, Cheng X, Yuan L, Luo J. MagicTime: time-lapse video generation models as metamorphic simulators. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2025, 47(9): 7340\u20137351","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"51171_CR85","unstructured":"Ma X, Wang Y, Jia G, Chen X, Liu Z, Li Y F, Chen C, Qiao Y. Latte: latent diffusion transformer for video generation. Transactions on Machine Learning Research, 2025"},{"key":"51171_CR86","unstructured":"Zheng Z, Peng X, Yang T, Shen C, Li S, Liu H, Zhou Y, Li T, You Y. Open-Sora: democratizing efficient video production for all. 2024, arXiv preprint arXiv:2412.20404"},{"key":"51171_CR87","volume-title":"Proceedings of the 2nd International Conference on Learning Representations","author":"D P Kingma","year":"2014","unstructured":"Kingma D P, Welling M. Auto-encoding variational Bayes. In: Proceedings of the 2nd International Conference on Learning Representations. 2014"},{"key":"51171_CR88","unstructured":"Blattmann A, Dockhorn T, Kulal S, Mendelevitch D, Kilian M, Lorenz D, Levi Y, English Z, Voleti V, Letts A, Jampani V, Rombach R. Stable video diffusion: scaling latent video diffusion models to large datasets. 2023, arXiv preprint arXiv: 2311.15127"},{"key":"51171_CR89","unstructured":"Sun P, Jiang Y, Chen S, Zhang S, Peng B, Luo P, and Yuan Z. Autoregressive model beats diffusion: llama for scalable image generation. 2024, arXiv preprint arXiv: 2406.06525"},{"key":"51171_CR90","first-page":"1650","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"A Vahdat","year":"2020","unstructured":"Vahdat A, Kautz J. NVAE: a deep hierarchical variational autoencoder. In: Proceedings of the 34th International Conference on Neural Information Processing Systems. 2020, 1650"},{"key":"51171_CR91","volume-title":"Proceedings of the 6th International Conference on Learning Representations","author":"T Karras","year":"2018","unstructured":"Karras T, Aila T, Laine S, Lehtinen J. Progressive growing of GANs for improved quality, stability, and variation. In: Proceedings of the 6th International Conference on Learning Representations. 2018"},{"key":"51171_CR92","first-page":"125","volume-title":"Proceedings of the 9th ISCA Speech Synthesis Workshop","author":"A van den Oord","year":"2016","unstructured":"van den Oord A, Dieleman S, Zen H, Simonyan K, Vinyals O, Graves A, Kalchbrenner N, Senior A W, Kavukcuoglu K. WaveNet: a generative model for raw audio. In: Proceedings of the 9th ISCA Speech Synthesis Workshop. 2016, 125"},{"issue":"3","key":"51171_CR93","doi-asserted-by":"publisher","first-page":"36","DOI":"10.4018\/IJMDEM.2020070103","volume":"11","author":"C Jin","year":"2020","unstructured":"Jin C, Wang T, Liu S, Tie Y, Li J, Li X, Lui S. A transformer-based model for multi-track music generation. International Journal of Multimedia Data Engineering and Management, 2020, 11(3): 36\u201354","journal-title":"International Journal of Multimedia Data Engineering and Management"},{"key":"51171_CR94","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1007\/978-3-031-18444-4_2","volume-title":"Advances in Speech and Music Technology: Computational Aspects and Applications","author":"C Hernandez-Olivan","year":"2023","unstructured":"Hernandez-Olivan C, Beltr\u00e1n J R. Music composition with deep learning: a review. In: Biswas A, Wennekes E, Wieczorkowska A, Laskar R H, eds. Advances in Speech and Music Technology: Computational Aspects and Applications. Cham: Springer, 2023, 25\u201350"},{"key":"51171_CR95","first-page":"6000","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A N, Kaiser \u0141, Polosukhin I. Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems. 2017, 6000\u20136010"},{"key":"51171_CR96","first-page":"311","volume-title":"Proceedings of the 25th International Society for Music Information Retrieval Conference","author":"Y H Lan","year":"2024","unstructured":"Lan Y H, Hsiao W Y, Cheng H C, Yang Y H. MusiConGen: Rhythm and chord control for transformer-based text-to-music generation. In: Proceedings of the 25th International Society for Music Information Retrieval Conference. 2024, 311\u2013318"},{"key":"51171_CR97","doi-asserted-by":"publisher","first-page":"1180","DOI":"10.1145\/3394171.3413671","volume-title":"Proceedings of the 28th ACM International Conference on Multimedia","author":"Y S Huang","year":"2020","unstructured":"Huang Y S, Yang Y H. Pop music transformer: beat-based modeling and generation of expressive pop piano compositions. In: Proceedings of the 28th ACM International Conference on Multimedia. 2020, 1180\u20131188"},{"issue":"11","key":"51171_CR98","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1145\/3672554","volume":"56","author":"A Dash","year":"2024","unstructured":"Dash A, Agres K. AI-based affective music generation systems: a review of methods and challenges. ACM Computing Surveys, 2024, 56(11): 287","journal-title":"ACM Computing Surveys"},{"key":"51171_CR99","doi-asserted-by":"publisher","first-page":"2830","DOI":"10.1109\/TASLPRO.2025.3574867","volume":"33","author":"X Liu","year":"2025","unstructured":"Liu X, Zhu Z, Liu H, Yuan Y, Huang Q, Cui M, Liang J, Cao Y, Kong Q, Plumbley M D, Wang W. WavJourney: Compositional audio creation with large language models. IEEE Transactions on Audio, Speech and Language Processing, 2025, 33: 2830\u20132844","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"key":"51171_CR100","first-page":"377","volume-title":"Proceedings of the 12th IEEE International Conference on Semantic Computing (ICSC)","author":"H H Mao","year":"2018","unstructured":"Mao H H, Shin T, Cottrell G. DeepJ: style-specific music generation. In: Proceedings of the 12th IEEE International Conference on Semantic Computing (ICSC). 2018, 377\u2013382"},{"key":"51171_CR101","first-page":"1118","volume-title":"Proceedings of the 18th Annual Conference of the International Speech Communication Association","author":"A Tamamori","year":"2017","unstructured":"Tamamori A, Hayashi T, Kobayashi K, Takeda K, Toda T. Speaker-dependent WaveNet vocoder. In: Proceedings of the 18th Annual Conference of the International Speech Communication Association. 2017, 1118\u20131122"},{"key":"51171_CR102","first-page":"1068","volume-title":"Proceedings of the 34th International Conference on Machine Learning","author":"J Engel","year":"2017","unstructured":"Engel J, Resnick C, Roberts A, Dieleman S, Norouzi M, Eck D, Simonyan K. Neural audio synthesis of musical notes with WaveNet autoencoders. In: Proceedings of the 34th International Conference on Machine Learning. 2017, 1068\u20131077"},{"key":"51171_CR103","first-page":"4227","volume-title":"Proceedings of the 23rd Annual Conference of the International Speech Communication Association","author":"H Liu","year":"2022","unstructured":"Liu H, Choi W, Liu X, Kong Q, Tian Q, Wang D L. Neural vocoder is all you need for speech super-resolution. In: Proceedings of the 23rd Annual Conference of the International Speech Communication Association. 2022, 4227\u20134231"},{"issue":"3","key":"51171_CR104","doi-asserted-by":"publisher","first-page":"42","DOI":"10.1109\/MSP.2021.3134634","volume":"39","author":"L Ericsson","year":"2022","unstructured":"Ericsson L, Gouk H, Loy C C, Hospedales T M. Self-supervised representation learning: introduction, advances, and challenges. IEEE Signal Processing Magazine, 2022, 39(3): 42\u201362","journal-title":"IEEE Signal Processing Magazine"},{"issue":"12","key":"51171_CR105","doi-asserted-by":"publisher","first-page":"5894","DOI":"10.3390\/app12125894","volume":"12","author":"T Vanhatalo","year":"2022","unstructured":"Vanhatalo T, Legrand P, Desainte-Catherine M, Hanna P, Brusco A, Pille G, Bayle Y. A review of neural network-based emulation of guitar amplifiers. Applied Sciences, 2022, 12(12): 5894","journal-title":"Applied Sciences"},{"key":"51171_CR106","doi-asserted-by":"publisher","first-page":"3746","DOI":"10.1145\/3474085.3475180","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia","author":"S Wu","year":"2021","unstructured":"Wu S, Liu Z, Lu S, Cheng L. Dual learning music composition and dance choreography. In: Proceedings of the 29th ACM International Conference on Multimedia. 2021, 3746\u20133754"},{"key":"51171_CR107","first-page":"3029","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Y X Wang","year":"2017","unstructured":"Wang Y X, Ramanan D, Hebert M. Growing a brain: fine-tuning by increasing model capacity. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017, 3029\u20133038"},{"issue":"4","key":"51171_CR108","doi-asserted-by":"publisher","first-page":"798","DOI":"10.3390\/math11040798","volume":"11","author":"S Li","year":"2023","unstructured":"Li S, Sung Y. MRBERT: pre-training of melody and rhythm for automatic music generation. Mathematics, 2023, 11(4): 798","journal-title":"Mathematics"},{"key":"51171_CR109","unstructured":"Tokui N. Can GAN originate new electronic dance music genres? \u2014Generating novel rhythm patterns using GAN with genre ambiguity loss. 2020, arXiv preprint arXiv: 2011.13062"},{"key":"51171_CR110","first-page":"616","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"M Zhang","year":"2023","unstructured":"Zhang M, Li H, Cai Z, Ren J, Yang L, Liu Z. FineMoGen: finegrained spatio-temporal motion generation and editing. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 616"},{"key":"51171_CR111","first-page":"5142","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"C Guo","year":"2022","unstructured":"Guo C, Zou S, Zuo X, Wang S, Ji W, Li X, Cheng L. Generating diverse and natural 3D human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2022, 5142\u20135151"},{"key":"51171_CR112","first-page":"66","volume-title":"Proceedings of the ACM SIGGRAPH 2024 Conference Papers","author":"H Sun","year":"2024","unstructured":"Sun H, Zheng R, Huang H, Ma C, Huang H, Hu R. LGTM: local-to-global text-driven human motion diffusion model. In: Proceedings of the ACM SIGGRAPH 2024 Conference Papers. 2024, 66"},{"key":"51171_CR113","doi-asserted-by":"publisher","first-page":"6989","DOI":"10.1145\/3664647.3681034","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia","author":"W Chen","year":"2024","unstructured":"Chen W, Xiao H, Zhang E, Hu L, Wang L, Liu M, Chen C. SATO: stable text-to-motion framework. In: Proceedings of the 32nd ACM International Conference on Multimedia. 2024, 6989\u20136997"},{"key":"51171_CR114","unstructured":"Zhao K, Li G, Tang S. DART: a diffusion-based autoregressive motion model for real-time text-driven motion control. 2024, arXiv preprint arXiv: 2410.05260"},{"key":"51171_CR115","first-page":"445","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"J Liu","year":"2025","unstructured":"Liu J, Dai W, Wang C, Cheng Y, Tang Y, Tong X. Plan, posture and go: towards open-vocabulary text-to-motion generation. In: Proceedings of the 18th European Conference on Computer Vision. 2025, 445\u2013463"},{"issue":"6","key":"51171_CR116","doi-asserted-by":"publisher","first-page":"9777","DOI":"10.1109\/TNNLS.2025.3526815","volume":"36","author":"J Li","year":"2025","unstructured":"Li J, Zhang Y, Zeng Y, Ye C, Xu W, Ben X, Wang F Y, Zhang J. Rethinking appearance-based deep gait recognition: reviews, analysis, and insights from gait recognition evolution. IEEE Transactions on Neural Networks and Learning Systems, 2025, 36(6): 9777\u20139797","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"51171_CR117","first-page":"18","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"S Chi","year":"2025","unstructured":"Chi S, Chi H G, Ma H, Agarwal N, Siddiqui F, Ramani K, Lee K. M2D2M: multi-motion generation from text with discrete diffusion models. In: Proceedings of the 18th European Conference on Computer Vision. 2025, 18\u201336"},{"issue":"4","key":"51171_CR118","doi-asserted-by":"publisher","first-page":"2430","DOI":"10.1109\/TPAMI.2023.3330935","volume":"46","author":"W Zhu","year":"2024","unstructured":"Zhu W, Ma X, Ro D, Ci H, Zhang J, Shi J, Gao F, Tian Q, Wang Y. Human motion generation: a survey. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2024, 46(4): 2430\u20132449","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"51171_CR119","first-page":"9085","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Y Sun","year":"2019","unstructured":"Sun Y, Liu J, Liu W, Han J, Ding E, Liu J. Chinese street view text: large-scale Chinese text reading with partially supervised learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2019, 9085\u20139094"},{"key":"51171_CR120","unstructured":"Mao Y, Liu X, Zhou W, Lu Z, Li H. Learning generalizable human motion generator with reinforcement learning. 2024, arXiv preprint arXiv: 2405.15541"},{"key":"51171_CR121","first-page":"2901","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops","author":"K Uchida","year":"2025","unstructured":"Uchida K, Shibuya T, Takida Y, Murata N, Tanke J, Takahashi S, Mitsufuji Y. MoLA: motion generation and editing with latent diffusion enhanced by adversarial training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops. 2025, 2901\u20132910"}],"container-title":["Frontiers of Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-025-51171-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11704-025-51171-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-025-51171-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T03:02:27Z","timestamp":1771556547000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11704-025-51171-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,20]]},"references-count":121,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2026,12]]}},"alternative-id":["51171"],"URL":"https:\/\/doi.org\/10.1007\/s11704-025-51171-9","relation":{},"ISSN":["2095-2228","2095-2236"],"issn-type":[{"value":"2095-2228","type":"print"},{"value":"2095-2236","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,20]]},"assertion":[{"value":"6 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare that they have no competing interests or financial conflicts to disclose.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"2012368"}}