{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T07:21:38Z","timestamp":1743060098446,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":39,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819620609"},{"type":"electronic","value":"9789819620616"}],"license":[{"start":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:00:00Z","timestamp":1735603200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:00:00Z","timestamp":1735603200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2061-6_20","type":"book-chapter","created":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T05:45:45Z","timestamp":1735537545000},"page":"268-281","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["LITA: LMM-Guided Image-Text Alignment for\u00a0Art Assessment"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-1492-915X","authenticated-orcid":false,"given":"Tatsumi","family":"Sunada","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0603-3377","authenticated-orcid":false,"given":"Kaede","family":"Shiohara","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4650-8841","authenticated-orcid":false,"given":"Ling","family":"Xiao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1784-2314","authenticated-orcid":false,"given":"Toshihiko","family":"Yamasaki","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,31]]},"reference":[{"key":"20_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Amirshahi, S.A., Denzler, J.: Judging aesthetic quality in paintings based on artistic inspired color features. In: DICTA, pp.\u00a01\u20138 (2017)","DOI":"10.1109\/DICTA.2017.8227452"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Chen, Q., et al.: Adaptive fractional dilated convolution network for image aesthetics assessment. In: CVPR, pp. 14114\u201314123 (2020)","DOI":"10.1109\/CVPR42600.2020.01412"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"20_CR5","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL, pp. 4171\u20134186 (2019)"},{"key":"20_CR6","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. ICLR (2021)"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Guo, X., Kurita, T., Asano, C.M., Asano, A.: Visual complexity assessment of painting images. In: ICIP, pp. 388\u2013392 (2013)","DOI":"10.1109\/ICIP.2013.6738080"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"He, S., Ming, A., Zheng, S., Zhong, H., Ma, H.: EAT: an enhancer for aesthetics-oriented transformers. In: ACMMM, pp. 1023\u20131032 (2023)","DOI":"10.1145\/3581783.3611881"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"He, S., Zhang, Y., Xie, R., Jiang, D., Ming, A.: Rethinking image aesthetics assessment: models, datasets and benchmarks. In: IJCAI, pp. 942\u2013948 (2022)","DOI":"10.24963\/ijcai.2022\/132"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Hii, Y.L., See, J., Kairanbay, M., Wong, L.K.: Multigap: multi-pooled inception network with text augmentation for aesthetic prediction of photographs. In: ICIP, pp. 1722\u20131726 (2017)","DOI":"10.1109\/ICIP.2017.8296576"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Hosu, V., Goldlucke, B., Saupe, D.: Effective aesthetics prediction with multi-level spatially pooled features. In: CVPR, pp. 9375\u20139383 (2019)","DOI":"10.1109\/CVPR.2019.00960"},{"key":"20_CR12","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML, pp. 4904\u20134916 (2021)"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Ke, J., Ye, K., Yu, J., Wu, Y., Milanfar, P., Yang, F.: VILA: learning image aesthetics from user comments with vision-language pretraining. In: CVPR, pp. 10041\u201310051 (2023)","DOI":"10.1109\/CVPR52729.2023.00968"},{"key":"20_CR14","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML, pp. 19730\u201319742 (2023)"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: CVPR, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"20_CR16","unstructured":"Liu, H., et al.: LLaVA-NEXT: improved reasoning, OCR, and world knowledge, January 2024. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"20_CR17","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Lo, K.Y., Liu, K.H., Chen, C.S.: Intelligent photographing interface with on-device aesthetic quality assessment. In: ACCVW, pp. 533\u2013544 (2013)","DOI":"10.1007\/978-3-642-37484-5_43"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Murray, N., Marchesotti, L., Perronnin, F.: AVA: a large-scale database for aesthetic visual analysis. In: CVPR, pp. 2408\u20132415 (2012)","DOI":"10.1109\/CVPR.2012.6247954"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Nishiyama, M., Okabe, T., Sato, I., Sato, Y.: Aesthetic quality classification of photographs based on color harmony. In: CVPR, pp. 33\u201340 (2011)","DOI":"10.1109\/CVPR.2011.5995539"},{"key":"20_CR21","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"20_CR22","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: ICML, pp. 8821\u20138831. PMLR (2021)"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"issue":"2","key":"20_CR24","first-page":"169","volume":"41","author":"RM Sakia","year":"1992","unstructured":"Sakia, R.M.: The box-cox transformation technique: a review. J. Royal Stat. Soc. Ser. D Stat. 41(2), 169\u2013178 (1992)","journal-title":"J. Royal Stat. Soc. Ser. D Stat."},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"She, D., Lai, Y.K., Yi, G., Xu, K.: Hierarchical layout-aware graph convolutional network for unified aesthetics assessment. In: CVPR, pp. 8475\u20138484 (2021)","DOI":"10.1109\/CVPR46437.2021.00837"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Sheng, K., Dong, W., Ma, C., Mei, X., Huang, F., Hu, B.G.: Attention-based multi-patch aggregation for image aesthetic assessment. In: ACMMM, pp. 879\u2013886 (2018)","DOI":"10.1145\/3240508.3240554"},{"key":"20_CR27","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127434","volume":"582","author":"T Shi","year":"2024","unstructured":"Shi, T., Chen, C., Li, X., Hao, A.: Semantic and style based multiple reference learning for artistic and general image aesthetic assessment. Neurocomputing 582, 127434 (2024)","journal-title":"Neurocomputing"},{"issue":"8","key":"20_CR28","first-page":"3998","volume":"27","author":"H Talebi","year":"2018","unstructured":"Talebi, H., Milanfar, P.: NIMA: neural image assessment. TIP 27(8), 3998\u20134011 (2018)","journal-title":"TIP"},{"issue":"1","key":"20_CR29","first-page":"2477","volume":"1","author":"W Yang","year":"2015","unstructured":"Yang, W.: Figure and landscape photo quality assessment based on visual aesthetics. JoICS 1(1), 2477\u20132486 (2015)","journal-title":"JoICS"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Wang, J., Chan, K.C., Loy, C.C.: Exploring clip for assessing the look and feel of images. In: AAAI, pp. 2555\u20132563 (2023)","DOI":"10.1609\/aaai.v37i2.25353"},{"key":"20_CR31","doi-asserted-by":"publisher","first-page":"2783","DOI":"10.1007\/s11042-022-13333-w","volume":"82","author":"L Wang","year":"2023","unstructured":"Wang, L., Wang, X., Yamasaki, T.: Image aesthetics prediction using multiple patches preserving the original aspect ratio of contents. Multimedia Tools Appl. 82, 2783\u20132804 (2023)","journal-title":"Multimedia Tools Appl."},{"key":"20_CR32","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1007\/978-3-031-50069-5_17","volume-title":"CGI 2023","author":"Y Wang","year":"2023","unstructured":"Wang, Y., Cao, W., Sheng, N., Shi, H., Guo, C., Ke, Y.: TSC-Net: theme-style-color guided artistic image aesthetics assessment network. In: Sheng, B., Bi, L., Kim, J., Magnenat-Thalmann, N., Thalmann, D. (eds.) CGI 2023. LNCS, vol. 14495, pp. 193\u2013203. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-50069-5_17"},{"key":"20_CR33","unstructured":"Wu, H., et al.: Q-align: teaching LMMs for visual scoring via discrete text-defined levels. arXiv preprint arXiv:2312.17090 (2023)"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Yi, R., Tian, H., Gu, Z., Lai, Y.K., Rosin, P.L.: Towards artistic image aesthetics assessment: a large-scale dataset and a new method. In: CVPR, pp. 22388\u201322397 (2023)","DOI":"10.1109\/CVPR52729.2023.02144"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Yu, Z., Guan, F., Lu, Y., Li, X., Chen, Z.: SF-IQA: quality and similarity integration for AI generated image quality assessment. In: CVPR, pp. 6692\u20136701 (2024)","DOI":"10.1109\/CVPRW63382.2024.00663"},{"key":"20_CR36","first-page":"1548","volume":"29","author":"H Zeng","year":"2019","unstructured":"Zeng, H., Cao, Z., Zhang, L., Bovik, A.C.: A unified probabilistic formulation of image aesthetic assessment. TIP 29, 1548\u20131561 (2019)","journal-title":"TIP"},{"key":"20_CR37","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.neucom.2020.10.046","volume":"430","author":"X Zhang","year":"2021","unstructured":"Zhang, X., Gao, X., He, L., Lu, W.: MSCAN: multimodal self-and-collaborative attention network for image aesthetic prediction tasks. Neurocomputing 430, 14\u201323 (2021)","journal-title":"Neurocomputing"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Lu, X., Zhang, J., Wang, J.Z.: Joint image and text representation for aesthetics analysis. In: ACMMM, pp. 262\u2013266 (2016)","DOI":"10.1145\/2964284.2967223"},{"issue":"3","key":"20_CR39","doi-asserted-by":"publisher","first-page":"1798","DOI":"10.1109\/TCYB.2020.2984670","volume":"52","author":"H Zhu","year":"2020","unstructured":"Zhu, H., Li, L., Wu, J., Zhao, S., Ding, G., Shi, G.: Personalized image aesthetics assessment via meta-learning with bilevel gradient optimization. Trans. Cybern. 52(3), 1798\u20131811 (2020)","journal-title":"Trans. Cybern."}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2061-6_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,30]],"date-time":"2024-12-30T06:05:41Z","timestamp":1735538741000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2061-6_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,31]]},"ISBN":["9789819620609","9789819620616"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2061-6_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,31]]},"assertion":[{"value":"31 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}