{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T23:11:12Z","timestamp":1776813072318,"version":"3.51.2"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T00:00:00Z","timestamp":1739318400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T00:00:00Z","timestamp":1739318400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J AUDIO SPEECH MUSIC PROC."],"DOI":"10.1186\/s13636-025-00397-3","type":"journal-article","created":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T13:34:41Z","timestamp":1739367281000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["AI-based Chinese-style music generation from video content: a study on cross-modal analysis and generation methods"],"prefix":"10.1186","volume":"2025","author":[{"given":"Moxi","family":"Cao","sequence":"first","affiliation":[]},{"given":"Jiaxiang","family":"Zheng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5214-9840","authenticated-orcid":false,"given":"Chongbin","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,12]]},"reference":[{"key":"397_CR1","unstructured":"Suno\u00a0AI. Suno: Ai-powered music and lyrics generator. (2023).\u00a0https:\/\/www.suno.com.\u00a0Accessed 20 Aug 2024"},{"key":"397_CR2","unstructured":"Z. Evans, C.J. Carr, J. Taylor, S.H. Hawley, J. Pons, Fast timing-conditioned latent audio diffusion.\u00a0(2024).\u00a0arXiv preprint arXiv:2402.04825"},{"issue":"4","key":"397_CR3","first-page":"756","volume":"60","author":"A Johnson","year":"2023","unstructured":"A. Johnson et al., The impact of ai-generated music in advertising: A case study approach. J. Mark. Res. 60(4), 756\u2013775 (2023)","journal-title":"J. Mark. Res."},{"issue":"2","key":"397_CR4","first-page":"189","volume":"15","author":"R Williams","year":"2023","unstructured":"R. Williams et al., Dynamic music generation in video games using AI: Challenges and opportunities. IEEE Trans. Games 15(2), 189\u2013203 (2023)","journal-title":"IEEE Trans. Games"},{"key":"397_CR5","unstructured":"S. Park et al., AI-assisted music therapy: A systematic review of applications and outcomes. Front. Psychol. 14(1047392) (2023)"},{"key":"397_CR6","unstructured":"iResearch. China\u2019s digital music market report 2022. (2022).\u00a0https:\/\/www.iresearch.com.cn.\u00a0Accessed 21 Aug 2024."},{"key":"397_CR7","unstructured":"QuestMobile. 2023 China short video platform user behavior report. (2023). https:\/\/www.questmobile.com.cn.\u00a0Accessed 22 Aug 2024."},{"key":"397_CR8","doi-asserted-by":"crossref","unstructured":"K.V.D. Doel, P.G. Kry, D.K. Pai, in Proceedings of the 28th Annual Conference on Computer Graphics and Interactive Techniques. FoleyAutomatic: Physically-based sound effects for interactive simulation and animation (ACM, New York, 2001), pp. 537\u2013544","DOI":"10.1145\/383259.383322"},{"key":"397_CR9","unstructured":"K. Su, X. Liu, E. Shlizerman, Multi-instrumentalist net: Unsupervised generation of music from body movements. (2020). arXiv\u00a0preprint\u00a0arXiv:2012.03478"},{"key":"397_CR10","unstructured":"K. Su, X. Liu, E. Shlizerman, in Advances in Neural Information Processing Systems. How does it sound? vol. 34 (2021)"},{"key":"397_CR11","doi-asserted-by":"crossref","unstructured":"A. Owens, P. Isola, J. McDermott, A. Torralba, E.H. Adelson, W.T. Freeman.Visually Indicated Sounds. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). (IEEE, Piscataway,\u00a02016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"397_CR12","doi-asserted-by":"crossref","unstructured":"Y. Zhou, Z. Wang, C. Fang, T. Bui, T.L. Berg, in Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Visual to sound: Generating natural sound for videos in the wild (IEEE, Salt Lake City, 2018), pp. 3550\u20133558","DOI":"10.1109\/CVPR.2018.00374"},{"key":"397_CR13","doi-asserted-by":"crossref","unstructured":"A.S. Koepke, O. Wiles, Y. Moses, A. Zisserman, in ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Sight to sound: An end-to-end approach for visual piano transcription (IEEE, Barcelona, 2020), pp. 1838\u20131842","DOI":"10.1109\/ICASSP40776.2020.9053115"},{"key":"397_CR14","doi-asserted-by":"crossref","unstructured":"C. Gan, D. Huang, P. Chen, J.B. Tenenbaum, A. Torralba, in European Conference on Computer Vision. Foley music: Learning to generate music from videos (Springer, Glasgow, 2020), pp. 758\u2013775","DOI":"10.1007\/978-3-030-58621-8_44"},{"key":"397_CR15","doi-asserted-by":"crossref","unstructured":"S. Di, Z. Jiang, S. Liu, Z. Wang, L. Zhu, Z. He, H. Liu, S. Yan, in Proceedings of the 29th ACM International Conference on Multimedia. Video background music generation with controllable music transformer (ACM, Chengdu, 2021)","DOI":"10.1145\/3474085.3475195"},{"key":"397_CR16","doi-asserted-by":"crossref","unstructured":"L. Zhuo, Z. Wang, B. Wang, Y. Liao, C. Bao, S. Peng, S. Han, A. Zhang, F. Fang, S. Liu, in Proceedings of the IEEE\/CVF International Conference on Computer Vision. Video background music generation: Dataset, method and evaluation (IEEE, Paris, 2023), pp. 15637\u201315647","DOI":"10.1109\/ICCV51070.2023.01433"},{"key":"397_CR17","doi-asserted-by":"publisher","unstructured":"J. Kang, S. Poria, D. Herremans, Video2music: Suitable music generation from videos using an affective multimodal transformer model. Expert. Syst. Appl.\u00a0(Elsevier BV, 2024).\u00a0249.\u00a0https:\/\/doi.org\/10.1016\/j.eswa.2024.123640. arXiv\u00a0preprint\u00a0arXiv:2311.00968","DOI":"10.1016\/j.eswa.2024.123640"},{"key":"397_CR18","unstructured":"V. Iashin, E. Rahtu, Taming visually guided sound generation. (2021). arXiv\u00a0preprint\u00a0arXiv:2110.08791"},{"key":"397_CR19","doi-asserted-by":"crossref","unstructured":"A. Ephrat, S. Peleg, in 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Vid2speech: Speech reconstruction from silent video (IEEE, New Orleans, 2017), pp. 5095\u20135099","DOI":"10.1109\/ICASSP.2017.7953127"},{"key":"397_CR20","doi-asserted-by":"crossref","unstructured":"K. Prajwal, R. Mukhopadhyay, V.P. Namboodiri, C. Jawahar, in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. Learning individual speaking styles for accurate lip to speech synthesis (IEEE, Seattle, 2020), pp. 13796\u201313805","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"397_CR21","unstructured":"S. Rouard, G. Hadjeres, in Proceedings of the ISMIR 2021. CRASH: Raw audio score-based generative modeling for controllable high-resolution drum sound synthesis (Online, 2021)"},{"key":"397_CR22","unstructured":"S. Forsgren, H. Martiros. Riffusion-stable diffusion for real-time music generation. (2022). https:\/\/github.com\/riffusion\/riffusion.\u00a0Accessed 22 Aug 2024."},{"key":"397_CR23","unstructured":"C. Hawthorne, I. Simon, A. Roberts, N. Zeghidour, J. Gardner, E. Manilow, J. Engel, Multi-instrument music synthesis with spectrogram diffusion. (2022). arXiv\u00a0preprint\u00a0arXiv:2206.05408"},{"key":"397_CR24","unstructured":"Q. Huang, A. Jansen, J. Lee, R. Ganti, J.Y. Li, D.P.W. Ellis, MuLan: A joint embedding of music audio and natural language. (2022). arXiv\u00a0preprint\u00a0arXiv:2208.12415"},{"key":"397_CR25","unstructured":"A. Agostinelli, T.I. Denk, Z. Borsos, J. Engel, M. Verzetti, A. Caillon, Q. Huang, A. Jansen, A. Roberts, M. Tagliasacchi et al., MusicLM: Generating music from text. (2023). arXiv\u00a0preprint\u00a0arXiv:2301.11325"},{"key":"397_CR26","unstructured":"Q. Huang, D.S. Park, T. Wang, T.I. Denk, A. Ly, N. Chen, Z. Zhang, Z. Zhang, J. Yu, C. Frank et al., Noise2Music: Text-conditioned music generation with diffusion models. (2023). arXiv\u00a0preprint\u00a0arXiv:2302.03917"},{"key":"397_CR27","unstructured":"H. F Garcia, P. Seetharaman, R. Kumar, B. Pardo, VampNet: Music generation via masked acoustic token modeling. (2023). arXiv\u00a0preprint\u00a0arXiv:2307.04686"},{"key":"397_CR28","unstructured":"G. Mariani, I. Tallini, E. Postolache, M. Mancusi, L. Cosmo, E. Rodol\u00e0, Multi-source diffusion models for simultaneous music generation and separation. (2024). arXiv\u00a0preprint\u00a0arXiv:2302.02257"},{"key":"397_CR29","unstructured":"A. Ziv, I. Gat, G.L. Lan, T. Remez, F. Kreuk, A. D\u00e9fossez, J. Copet, G. Synnaeve, Y. Adi, Masked audio generation using a single non-autoregressive transformer. (2024). arXiv\u00a0preprint\u00a0arXiv:2401.04577"},{"key":"397_CR30","doi-asserted-by":"crossref","unstructured":"J.D. Parker, J. Spijkervet, K. Kosta, F. Yesiler, B. Kuznetsov, J.C. Wang, M. Avent, J. Chen, D. Le, StemGen: A music generation model that listens. (2024). arXiv\u00a0preprint\u00a0arXiv:2312.08723","DOI":"10.1109\/ICASSP48485.2024.10446088"},{"key":"397_CR31","doi-asserted-by":"crossref","unstructured":"Z. Zeng, L. Zhou, in Proceedings of the 2021 6th International Conference on Intelligent Computing and Signal Processing (ICSP). A memetic algorithm for Chinese traditional music composition (IEEE, Xi\u2019an, 2021), pp. 187\u2013192","DOI":"10.1109\/ICSP51882.2021.9408813"},{"key":"397_CR32","unstructured":"Y. Wang, G. Xia, A survey on AI-generated music creation and analysis. (2022). arXiv\u00a0preprint\u00a0arXiv:2208.11178"},{"key":"397_CR33","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1007\/s10489-023-05195-y","volume":"54","author":"P Li","year":"2024","unstructured":"P. Li, T.M. Liang, Y.M. Cao, X.M. Wang, X.J. Wu, L.Y. Lei, A novel Xi\u2019an drum music generation method based on Bi-LSTM deep reinforcement learning. Appl. Intell. 54, 80\u201394 (2024)","journal-title":"Appl. Intell."},{"key":"397_CR34","doi-asserted-by":"publisher","unstructured":"S. Chen, Y. Zhong, R. Du, Automatic composition of Guzheng (Chinese Zither) music using long short-term memory network (LSTM) and reinforcement learning (RL). Sci. Rep. 12(15829) (2022).\u00a0https:\/\/doi.org\/10.1038\/s41598-022-19786-1","DOI":"10.1038\/s41598-022-19786-1"},{"key":"397_CR35","doi-asserted-by":"crossref","unstructured":"J. Luo, X. Yang, S. Ji, J. Li, in Proceedings of the 7th Conference on Sound and Music Technology (CSMT) Revised Selected Papers. MG-VAE: Deep Chinese folk songs generation with specific regional styles (Springer Singapore, Singapore, 2020), pp. 93\u2013106","DOI":"10.1007\/978-981-15-2756-2_8"},{"key":"397_CR36","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1016\/j.visinf.2020.04.003","volume":"4","author":"J Shen","year":"2020","unstructured":"J. Shen, R. Wang, H.W. Shen, Visual exploration of latent space for traditional Chinese music. Vis. Informat. 4, 99\u2013108 (2020)","journal-title":"Vis. Informat."},{"key":"397_CR37","doi-asserted-by":"crossref","unstructured":"F. Jiang, L. Zhang, K. Wang, X. Deng, W. Yang, BoYaTCN: Research on music generation of traditional Chinese pentatonic scale based on bidirectional octave your attention temporal convolutional network. Appl. Sci. 12(9309),\u00a06836\u20136846\u00a0(2022)","DOI":"10.3390\/app12189309"},{"key":"397_CR38","doi-asserted-by":"publisher","unstructured":"A. Arnab, M. Dehghani, G. Heigold, C. Sun, M. Lucic, C. Schmid, in Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). Vivit: A video vision transformer (2021), pp. 6836\u20136846. https:\/\/doi.org\/10.1109\/ICCV48922.2021.00675","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"397_CR39","doi-asserted-by":"publisher","unstructured":"A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, L. Kaiser, I. Polosukhin, in Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS 2017). Attention is all you need (2017), pp. 6000\u20136010. https:\/\/doi.org\/10.48550\/arXiv.1706.03762","DOI":"10.48550\/arXiv.1706.03762"},{"key":"397_CR40","doi-asserted-by":"publisher","unstructured":"V. Popov, C. Mai, D. Romero, H. Zen, O. Vinyals, in Proceedings of the 40th International Conference on Machine Learning (ICML 2023). Fast timing-conditioned latent audio diffusion (2023). https:\/\/doi.org\/10.48550\/arXiv.2306.13404","DOI":"10.48550\/arXiv.2306.13404"},{"key":"397_CR41","doi-asserted-by":"crossref","unstructured":"Y. Wu, K. Chen, T. Zhang, Y. Hui, T. Berg-Kirkpatrick, S. Dubnov, in ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Large-scale contrastive language-audio pretraining with feature fusion and keyword-to-caption augmentation (2023), pp. 1\u20135","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"397_CR42","doi-asserted-by":"crossref","unstructured":"S. Kullback, R.A. Leibler, On information and sufficiency. Ann. Math. Stat. (Institute of Mathematical Statistics, 1951).\u00a022(1), 79\u201386 .\u00a0https:\/\/www.jstor.org\/stable\/2236703","DOI":"10.1214\/aoms\/1177729694"},{"key":"397_CR43","unstructured":"Capella\u00a0Software. capella-audio2score. (n.d.). https:\/\/www.capella-software.com.\u00a0Accessed 22 Aug 2024."},{"key":"397_CR44","doi-asserted-by":"publisher","DOI":"10.1093\/oso\/9780198250470.001.0001","volume-title":"Introduction to a Philosophy of Music","author":"P Kivy","year":"2002","unstructured":"P. Kivy, Introduction to a Philosophy of Music (Clarendon Press, Oxford, 2002)"},{"key":"397_CR45","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/6575.001.0001","volume-title":"Sweet Anticipation: Music and the Psychology of Expectation","author":"D Huron","year":"2006","unstructured":"D. Huron, Sweet Anticipation: Music and the Psychology of Expectation (MIT Press, Cambridge, 2006)"},{"issue":"5","key":"397_CR46","doi-asserted-by":"publisher","first-page":"559","DOI":"10.1017\/S0140525X08005293","volume":"31","author":"PN Juslin","year":"2008","unstructured":"P.N. Juslin, D. V\u00e4stfj\u00e4ll, Emotional responses to music: The need to consider underlying mechanisms. Behav. Brain Sci. 31(5), 559\u2013575 (2008)","journal-title":"Behav. Brain Sci."},{"issue":"3","key":"397_CR47","doi-asserted-by":"publisher","first-page":"235","DOI":"10.1016\/j.plrev.2013.05.008","volume":"10","author":"PN Juslin","year":"2013","unstructured":"P.N. Juslin, From everyday emotions to aesthetic emotions: Towards a unified theory of musical emotions. Phys. Life Rev. 10(3), 235\u2013266 (2013)","journal-title":"Phys. Life Rev."},{"issue":"1","key":"397_CR48","doi-asserted-by":"publisher","first-page":"378","DOI":"10.1111\/nyas.13654","volume":"1423","author":"MT Pearce","year":"2018","unstructured":"M.T. Pearce, Statistical learning and probabilistic prediction in music cognition: Mechanisms of stylistic enculturation. Ann. N. Y. Acad. Sci. 1423(1), 378\u2013395 (2018)","journal-title":"Ann. N. Y. Acad. Sci."},{"key":"397_CR49","doi-asserted-by":"publisher","DOI":"10.3998\/mpub.155334","volume-title":"Listening to Popular Music: Or, How I Learned to Stop Worrying and Love Led Zeppelin","author":"T Gracyk","year":"2007","unstructured":"T. Gracyk, Listening to Popular Music: Or, How I Learned to Stop Worrying and Love Led Zeppelin (University of Michigan Press, Ann Arbor, 2007)"},{"key":"397_CR50","volume-title":"Audio-Vision: Sound on Screen","author":"M Chion","year":"1994","unstructured":"M. Chion, Audio-Vision: Sound on Screen (Columbia University Press, New York, 1994)"},{"key":"397_CR51","doi-asserted-by":"publisher","DOI":"10.1093\/oso\/9780198165897.001.0001","volume-title":"Analysing Musical Multimedia","author":"N Cook","year":"1998","unstructured":"N. Cook, Analysing Musical Multimedia (Oxford University Press, Oxford, 1998)"},{"key":"397_CR52","first-page":"17","volume-title":"Congruence-association Model of Music and Multimedia: Origin and Evolution","author":"AJ Cohen","year":"2013","unstructured":"A.J. Cohen, Congruence-association Model of Music and Multimedia: Origin and Evolution (Oxford University Press, Oxford, 2013), pp.17\u201347"},{"issue":"5","key":"397_CR53","doi-asserted-by":"publisher","first-page":"744","DOI":"10.3758\/BF03193776","volume":"69","author":"A Vatakis","year":"2007","unstructured":"A. Vatakis, C. Spence, Crossmodal binding: Evaluating the \u201cunity assumption\u2019\u2019 using audiovisual speech stimuli. Percept. Psychophys. 69(5), 744\u2013756 (2007)","journal-title":"Percept. Psychophys."},{"key":"397_CR54","doi-asserted-by":"publisher","DOI":"10.4324\/9780203508527","volume-title":"The Creative Mind: Myths and Mechanisms","author":"MA Boden","year":"2004","unstructured":"M.A. Boden, The Creative Mind: Myths and Mechanisms (Routledge, London, 2004)"},{"key":"397_CR55","volume-title":"Philosophy of Art: A Contemporary Introduction","author":"N Carroll","year":"1999","unstructured":"N. Carroll, Philosophy of Art: A Contemporary Introduction (Routledge, New York, 1999)"},{"key":"397_CR56","volume-title":"G\u00f6del, Escher, Bach: An Eternal Golden Braid","author":"DR Hofstadter","year":"1979","unstructured":"D.R. Hofstadter, G\u00f6del, Escher, Bach: An Eternal Golden Braid (Basic Books, New York, 1979)"},{"key":"397_CR57","volume-title":"The Intentional Stance","author":"DC Dennett","year":"1987","unstructured":"D.C. Dennett, The Intentional Stance (MIT Press, Cambridge, 1987)"},{"key":"397_CR58","volume-title":"The Study of Ethnomusicology: Thirty-one Issues and Concepts","author":"B Nettl","year":"2005","unstructured":"B. Nettl, The Study of Ethnomusicology: Thirty-one Issues and Concepts (University of Illinois Press, Urbana, 2005)"},{"key":"397_CR59","volume-title":"Cultural Psychology","author":"SJ Heine","year":"2011","unstructured":"S.J. Heine, Cultural Psychology (W. W. Norton & Company, New York, 2011)"},{"key":"397_CR60","volume-title":"Chinese Music","author":"J Jin","year":"2011","unstructured":"J. Jin, Chinese Music (Cambridge University Press, Cambridge, 2011)"},{"key":"397_CR61","volume-title":"Locating East Asia in Western Art Music","author":"YU Everett","year":"2004","unstructured":"Y.U. Everett, F. Lau, Locating East Asia in Western Art Music (Wesleyan University Press, Middletown, 2004)"}],"container-title":["EURASIP Journal on Audio, Speech, and Music Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-025-00397-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s13636-025-00397-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13636-025-00397-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T13:35:00Z","timestamp":1739367300000},"score":1,"resource":{"primary":{"URL":"https:\/\/asmp-eurasipjournals.springeropen.com\/articles\/10.1186\/s13636-025-00397-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,12]]},"references-count":61,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["397"],"URL":"https:\/\/doi.org\/10.1186\/s13636-025-00397-3","relation":{},"ISSN":["1687-4722"],"issn-type":[{"value":"1687-4722","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,12]]},"assertion":[{"value":"18 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"8"}}