{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:52:26Z","timestamp":1776887546074,"version":"3.51.2"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031728549","type":"print"},{"value":"9783031728556","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T00:00:00Z","timestamp":1731110400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T00:00:00Z","timestamp":1731110400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72855-6_23","type":"book-chapter","created":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T18:49:05Z","timestamp":1731091745000},"page":"397-413","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Training A Small Emotional Vision Language Model for\u00a0Visual Art Comprehension"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-1590-5886","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1464-9500","authenticated-orcid":false,"given":"Liang","family":"Zheng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3094-7735","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2594-254X","authenticated-orcid":false,"given":"Dan","family":"Guo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,9]]},"reference":[{"key":"23_CR1","doi-asserted-by":"crossref","unstructured":"Achlioptas, P., Ovsjanikov, M., Guibas, L., Tulyakov, S.: Affection: learning affective explanations for real-world visual data. In: CVPR, pp. 6641\u20136651 (2023)","DOI":"10.1109\/CVPR52729.2023.00642"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Achlioptas, P., Ovsjanikov, M., Haydarov, K., Elhoseiny, M., Guibas, L.J.: Artemis: affective language for visual art. In: CVPR, pp. 11569\u201311579 (2021)","DOI":"10.1109\/CVPR46437.2021.01140"},{"key":"23_CR3","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1162\/tacl_a_00540","volume":"11","author":"A Ananthram","year":"2023","unstructured":"Ananthram, A., Winn, O., Muresan, S.: Feelingblue: a corpus for understanding the emotional connotation of color in context. TACL 11, 176\u2013190 (2023)","journal-title":"TACL"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Bai, Z., Nakashima, Y., Garcia, N.: Explain me the painting: multi-topic knowledgeable art description generation. In: ICCV, pp. 5422\u20135432 (2021)","DOI":"10.1109\/ICCV48922.2021.00537"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Cen, J., Qing, C., Ou, H., Xu, X., Tan, J.: Masanet: multi-aspect semantic auxiliary network for visual sentiment analysis. IEEE TAC 1\u201312 (2024)","DOI":"10.1109\/TAFFC.2023.3331776"},{"key":"23_CR6","unstructured":"Chen, T., Borth, D., Darrell, T., Chang, S.F.: Deepsentibank: visual sentiment concept classification with deep convolutional neural networks. arXiv preprint arXiv:1410.8586 (2014)"},{"key":"23_CR7","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: ICML, pp. 1597\u20131607 (2020)"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"Chen, W., Chen, X., Zhang, J., Huang, K.: Beyond triplet loss: a deep quadruplet network for person re-identification. In: CVPR, pp. 403\u2013412 (2017)","DOI":"10.1109\/CVPR.2017.145"},{"issue":"3\u20134","key":"23_CR9","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1080\/02699939208411068","volume":"6","author":"P Ekman","year":"1992","unstructured":"Ekman, P., et al.: An argument for basic emotions. Cogn. Emot. 6(3\u20134), 169\u2013200 (1992)","journal-title":"Cogn. Emot."},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Garcia, N., et al.: A dataset and baselines for visual question answering on art. In: ECCV Workshop, pp. 92\u2013108 (2020)","DOI":"10.1007\/978-3-030-66096-3_8"},{"key":"23_CR11","unstructured":"Gugger, S., Howard, J.: AdamW and super-convergence is now the fastest way to train neural nets. Accessed 19 2018"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Hasan, M.K., et al.: Humor knowledge enriched transformer for understanding multimodal humor. In: AAAI, pp. 12972\u201312980 (2021)","DOI":"10.1609\/aaai.v35i14.17534"},{"key":"23_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"23_CR14","unstructured":"Holtzman, A., Buys, J., Du, L., Forbes, M., Choi, Y.: The curious case of neural text degeneration. arXiv preprint arXiv:1904.09751 (2019)"},{"key":"23_CR15","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Jung, C., Kwon, G., Ye, J.C.: Exploring patch-wise semantic relation for contrastive learning in image-to-image translation tasks. In: CVPR, pp. 18260\u201318269 (2022)","DOI":"10.1109\/CVPR52688.2022.01772"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Li, T., Hu, Y., Wu, X.: Image captioning with inherent sentiment. In: ICME, pp.\u00a01\u20136 (2021)","DOI":"10.1109\/ICME51207.2021.9428453"},{"key":"23_CR18","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS, pp. 1\u201325 (2023)"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Liu, J., Chen, Y., Xu, J.: Multimedia event extraction from news with a unified contrastive learning framework. In: ACM MM, pp. 1945\u20131953 (2022)","DOI":"10.1145\/3503161.3548132"},{"key":"23_CR20","doi-asserted-by":"publisher","first-page":"163","DOI":"10.1016\/j.neucom.2022.01.068","volume":"490","author":"Y Lu","year":"2022","unstructured":"Lu, Y., Guo, C., Dai, X., Wang, F.Y.: Data-efficient image captioning of fine art paintings via virtual-real semantic alignment training. Neurocomputing 490, 163\u2013180 (2022)","journal-title":"Neurocomputing"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Mathews, A., Xie, L., He, X.: Senticap: generating image descriptions with sentiments. In: AAAI, pp.\u00a01\u20137 (2016)","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Mohamed, Y., Khan, F.F., Haydarov, K., Elhoseiny, M.: It is okay to not be okay: overcoming emotional bias in affective image captioning by contrastive data collection. In: CVPR, pp. 21263\u201321272 (2022)","DOI":"10.1109\/CVPR52688.2022.02058"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Mohammad, S.: Obtaining reliable human ratings of valence, arousal, and dominance for 20,000 english words. In: ACL, pp. 174\u2013184 (2018)","DOI":"10.18653\/v1\/P18-1017"},{"key":"23_CR24","unstructured":"Mokady, R., Hertz, A., Bermano, A.H.: Clipcap: clip prefix for image captioning. arXiv preprint arXiv:2111.09734 (2021)"},{"key":"23_CR25","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"issue":"8","key":"23_CR26","first-page":"1","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 1\u201324 (2019)","journal-title":"OpenAI Blog"},{"issue":"1","key":"23_CR27","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1037\/0033-295X.110.1.145","volume":"110","author":"JA Russell","year":"2003","unstructured":"Russell, J.A.: Core affect and the psychological construction of emotion. Psychol. Rev. 110(1), 1\u201328 (2003)","journal-title":"Psychol. Rev."},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Ruta, D., et al.: Stylebabel: artistic style tagging and captioning. In: ECCV, pp. 219\u2013236 (2022)","DOI":"10.1007\/978-3-031-20074-8_13"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Sammani, F., Mukherjee, T., Deligiannis, N.: NLX-GPT: a model for natural language explanations in vision and vision-language tasks. In: CVPR, pp. 8322\u20138332 (2022)","DOI":"10.1109\/CVPR52688.2022.00814"},{"key":"23_CR30","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"23_CR31","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, pp. 1\u201311 (2017)"},{"key":"23_CR32","first-page":"4563","volume":"25","author":"G Wang","year":"2023","unstructured":"Wang, G., Guo, Y., Xu, Z., Wong, Y., Kankanhalli, M.S.: Semantic-aware triplet loss for image classification. IEEE TMM 25, 4563\u20134572 (2023)","journal-title":"IEEE TMM"},{"issue":"4","key":"23_CR33","doi-asserted-by":"publisher","first-page":"1073","DOI":"10.1007\/s11263-023-01752-7","volume":"131","author":"X Wu","year":"2023","unstructured":"Wu, X., Li, T.: Sentimental visual captioning using multimodal transformer. IJCV 131(4), 1073\u20131090 (2023)","journal-title":"IJCV"},{"key":"23_CR34","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML, pp. 2048\u20132057 (2015)"},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Xu, L., Wang, Z., Wu, B., Lui, S.: MDAN: multi-level dependent attention network for visual emotion analysis. In: CVPR, pp. 9479\u20139488 (2022)","DOI":"10.1109\/CVPR52688.2022.00926"},{"key":"23_CR36","first-page":"7432","volume":"30","author":"J Yang","year":"2021","unstructured":"Yang, J., Li, J., Wang, X., Ding, Y., Gao, X.: Stimuli-aware visual emotion analysis. IEEE TIP 30, 7432\u20137445 (2021)","journal-title":"IEEE TIP"},{"issue":"4","key":"23_CR37","first-page":"3269","volume":"14","author":"K Yang","year":"2023","unstructured":"Yang, K., Zhang, T., Alhuzali, H., Ananiadou, S.: Cluster-level contrastive learning for emotion recognition in conversations. IEEE TAC 14(4), 3269\u20133280 (2023)","journal-title":"IEEE TAC"},{"key":"23_CR38","doi-asserted-by":"crossref","unstructured":"You, Q., Luo, J., Jin, H., Yang, J.: Robust image sentiment analysis using progressively trained and domain transferred deep networks. In: AAAI, pp.\u00a01\u20138 (2015)","DOI":"10.1609\/aaai.v29i1.9179"},{"key":"23_CR39","unstructured":"Zhang, Y., Wang, M., Tiwari, P., Li, Q., Wang, B., Qin, J.: Dialoguellm: context and emotion knowledge-tuned llama models for emotion recognition in conversations. arXiv preprint arXiv:2310.11374 (2023)"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Zhao, W., Wu, X., Zhang, X.: Memcap: memorizing style knowledge for image captioning. In: AAAI, pp. 12984\u201312992 (2020)","DOI":"10.1609\/aaai.v34i07.6998"},{"key":"23_CR41","doi-asserted-by":"crossref","unstructured":"Zhong, P., Wang, D., Miao, C.: Knowledge-enriched transformer for emotion detection in textual conversations. arXiv preprint arXiv:1909.10681 (2019)","DOI":"10.18653\/v1\/D19-1016"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72855-6_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T19:06:20Z","timestamp":1731092780000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72855-6_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,9]]},"ISBN":["9783031728549","9783031728556"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72855-6_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,9]]},"assertion":[{"value":"9 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}