{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T18:52:29Z","timestamp":1767034349709,"version":"3.41.0"},"publisher-location":"Cham","reference-count":22,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031928079","type":"print"},{"value":"9783031928086","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-92808-6_11","type":"book-chapter","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T15:59:25Z","timestamp":1748361565000},"page":"173-186","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Art2Mus: Bridging Visual Arts and\u00a0Music Through Cross-Modal Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9597-064X","authenticated-orcid":false,"given":"Ivan","family":"Rinaldi","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6602-7504","authenticated-orcid":false,"given":"Nicola","family":"Fanelli","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6489-8628","authenticated-orcid":false,"given":"Giovanna","family":"Castellano","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0883-2691","authenticated-orcid":false,"given":"Gennaro","family":"Vessio","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"11_CR1","unstructured":"Agostinelli, A., et al.: MusicLM: generating music from text. arXiv preprint arXiv:2301.11325 (2023)"},{"key":"11_CR2","unstructured":"Benzi, K., Defferrard, M., Vandergheynst, P., Bresson, X.: FMA: a dataset for music analysis. arXiv preprint arXiv:1612.01840 (2016)"},{"key":"11_CR3","doi-asserted-by":"publisher","first-page":"108859","DOI":"10.1016\/J.KNOSYS.2022.108859","volume":"248","author":"G Castellano","year":"2022","unstructured":"Castellano, G., Digeno, V., Sansaro, G., Vessio, G.: Leveraging knowledge graphs and deep learning for automatic art analysis. Knowl. Based Syst. 248, 108859 (2022). https:\/\/doi.org\/10.1016\/J.KNOSYS.2022.108859","journal-title":"Knowl. Based Syst."},{"key":"11_CR4","unstructured":"Chung, H.W., et al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"11_CR5","doi-asserted-by":"publisher","unstructured":"Chung, Y., et al.: w2v-BERT: combining contrastive learning and masked language modeling for self-supervised speech pre-training. In: IEEE Automatic Speech Recognition and Understanding Workshop, ASRU 2021, Cartagena, Colombia, December 13-17, 2021, pp. 244\u2013250. IEEE (2021). https:\/\/doi.org\/10.1109\/ASRU51503.2021.9688253","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"11_CR6","unstructured":"Copet, J., et al.: Simple and controllable music generation. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10-16, 2023 (2023). http:\/\/papers.nips.cc\/paper_files\/paper\/2023\/hash\/94b472a1842cd7c56dcb125fb2765fbd-Abstract-Conference.html"},{"key":"11_CR7","doi-asserted-by":"publisher","unstructured":"Girdhar, R., et al.: ImageBind one embedding space to bind them all. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17-24, 2023, pp. 15180\u201315190. IEEE (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.01457","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"11_CR8","doi-asserted-by":"publisher","unstructured":"Hershey, S., et al.: CNN architectures for large-scale audio classification. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2017, New Orleans, LA, USA, March 5-9, 2017, pp. 131\u2013135. IEEE (2017). https:\/\/doi.org\/10.1109\/ICASSP.2017.7952132","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"11_CR9","unstructured":"Huang, P., et al.: Masked autoencoders that listen. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022 (2022). http:\/\/papers.nips.cc\/paper_files\/paper\/2022\/hash\/b89d5e209990b19e33b418e14f323998-Abstract-Conference.html"},{"key":"11_CR10","unstructured":"Huang, Q., Jansen, A., Lee, J., Ganti, R., Li, J.Y., Ellis, D.P.W.: MuLan: a joint embedding of music audio and natural language. arXiv preprint arXiv:2208.12415 (2022)"},{"key":"11_CR11","unstructured":"Hussain, A.S., Liu, S., Sun, C., Shan, Y.: M2UGen: multi-modal music understanding and generation with the power of large language models. arXiv preprint arXiv:2311.11255 (2023)"},{"key":"11_CR12","doi-asserted-by":"crossref","unstructured":"Kilgour, K., Zuluaga, M., Roblek, D., Sharifi, M.: Fr\u00e9chet audio distance: a metric for evaluating music enhancement algorithms. arXiv preprint arXiv:1812.08466 (2018)","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"11_CR13","unstructured":"Kingma, D.P., Salimans, T., Poole, B., Ho, J.: Variational diffusion models. arXiv preprint arXiv:2107.00630 (2021)"},{"key":"11_CR14","doi-asserted-by":"publisher","unstructured":"Liu, H., et al.: AudioLDM 2: learning holistic audio generation with self-supervised pretraining. IEEE ACM Trans. Audio Speech Lang. Process. 32, 2871\u20132883 (2024). https:\/\/doi.org\/10.1109\/TASLP.2024.3399607","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"11_CR15","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Oh, A., Naumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023 (2023). http:\/\/papers.nips.cc\/paper_files\/paper\/2023\/hash\/6dcf277ea32ce3288914faf369fe6de0-Abstract-Conference.html"},{"key":"11_CR16","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners (2019)"},{"key":"11_CR17","doi-asserted-by":"publisher","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18-24, 2022, pp. 10674\u201310685. IEEE (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01042","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"11_CR18","doi-asserted-by":"publisher","unstructured":"Shen, J., et al.: Natural TTS synthesis by conditioning Wavenet on MEL spectrogram predictions. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2018, Calgary, AB, Canada, April 15-20, 2018, pp. 4779\u20134783. IEEE (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8461368","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"11_CR19","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"11_CR20","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, pp. 5998\u20136008 (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"11_CR21","doi-asserted-by":"publisher","unstructured":"Zeghidour, N., Luebs, A., Omran, A., Skoglund, J., Tagliasacchi, M.: SoundStream: an end-to-end neural audio codec. IEEE ACM Trans. Audio Speech Lang. Process. 30, 495\u2013507 (2022). https:\/\/doi.org\/10.1109\/TASLP.2021.3129994","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"11_CR22","unstructured":"Zhang, R., Zhang, Y., Shao, K., Shan, Y., Xia, G.: Vis2Mus: exploring multimodal representation mapping for controllable music generation. arXiv preprint arXiv:2211.05543 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-92808-6_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T15:59:33Z","timestamp":1748361573000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-92808-6_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031928079","9783031928086"],"references-count":22,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-92808-6_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}