{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T04:04:44Z","timestamp":1745294684511,"version":"3.40.4"},"reference-count":56,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,3,9]],"date-time":"2025-03-09T00:00:00Z","timestamp":1741478400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,9]],"date-time":"2025-03-09T00:00:00Z","timestamp":1741478400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62471142"],"award-info":[{"award-number":["62471142"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Natural Science Foundation of Fujian Province, China","award":["2023J01067"],"award-info":[{"award-number":["2023J01067"]}]},{"name":"Major Science and Echnology Project of Fujian Province","award":["2021HZ022007"],"award-info":[{"award-number":["2021HZ022007"]}]},{"name":"Industry-Academy Cooperation Project","award":["2021H6022"],"award-info":[{"award-number":["2021H6022"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s00530-025-01736-2","type":"journal-article","created":{"date-parts":[[2025,3,9]],"date-time":"2025-03-09T17:28:17Z","timestamp":1741541297000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Progressive fusion of local and global image features for cross-modal image aesthetic assessment"],"prefix":"10.1007","volume":"31","author":[{"given":"Yuzhen","family":"Niu","sequence":"first","affiliation":[]},{"given":"Siling","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Shanshan","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Fusheng","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,9]]},"reference":[{"issue":"3","key":"1736_CR1","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1007\/s00530-024-01353-5","volume":"30","author":"X Wu","year":"2024","unstructured":"Wu, X., Shi, Z.: Full reference image quality assessment based on dual-space multi-feature fusion. Multimed. Syst. 30(3), 151 (2024)","journal-title":"Multimed. Syst."},{"issue":"1","key":"1736_CR2","doi-asserted-by":"publisher","first-page":"40","DOI":"10.1007\/s00530-023-01206-7","volume":"30","author":"M-L P\u00e9rez-Delgado","year":"2024","unstructured":"P\u00e9rez-Delgado, M.-L., Celebi, M.E.: A comparative study of color quantization methods using various image quality assessment indices. Multimed. Syst. 30(1), 40 (2024)","journal-title":"Multimed. Syst."},{"key":"1736_CR3","doi-asserted-by":"crossref","unstructured":"Vo, N., Jiang, L., Sun, C., Murphy, K., Li, L.-J., Fei-Fei, L., Hays, J.: Composing text and image for image retrieval\u2014an empirical odyssey. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6439\u20136448 (2019)","DOI":"10.1109\/CVPR.2019.00660"},{"issue":"4","key":"1736_CR4","doi-asserted-by":"publisher","first-page":"1463","DOI":"10.1109\/TCSVT.2020.3010181","volume":"31","author":"X Chai","year":"2020","unstructured":"Chai, X., Shao, F., Jiang, Q., Ho, Y.-S.: Roundness-preserving warping for aesthetic enhancement-based stereoscopic image editing. IEEE Trans. Circ. Syst. Video Technol. 31(4), 1463\u20131477 (2020)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"1736_CR5","doi-asserted-by":"crossref","unstructured":"Fang, N., Zhang, Y., Wei, Q., Guo, Y.: Composition-aware learning for aesthetic assessment of edited photos. In: Proceedings of the IEEE International Conference on Computer and Communications, pp. 2151\u20132155 (2020)","DOI":"10.1109\/ICCC51575.2020.9345309"},{"key":"1736_CR6","doi-asserted-by":"crossref","unstructured":"Su, H.-H., Chen, T.-W., Kao, C.-C., Hsu, W.H., Chien, S.-Y.: Scenic photo quality assessment with bag of aesthetics-preserving features. In: Proceedings of the ACM International Conference on Multimedia, pp. 1213\u20131216 (2011)","DOI":"10.1145\/2072298.2071977"},{"key":"1736_CR7","unstructured":"Ke, Y., Tang, X., Jing, F.: The design of high-level features for photo quality assessment. In: Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 419\u2013426 (2006)"},{"key":"1736_CR8","doi-asserted-by":"crossref","unstructured":"Datta, R., Joshi, D., Li, J., Wang, J.Z.: Studying aesthetics in photographic images using a computational approach. In: Proceedings of the European Conference on Computer Vision, pp. 288\u2013301 (2006)","DOI":"10.1007\/11744078_23"},{"key":"1736_CR9","doi-asserted-by":"crossref","unstructured":"Dong, Z., Shen, X., Li, H., Tian, X.: Photo quality assessment with DCNN that understands image well. In: Proceedings of the International Conference on Multimedia Modeling, pp. 524\u2013535 (2015)","DOI":"10.1007\/978-3-319-14442-9_57"},{"key":"1736_CR10","doi-asserted-by":"publisher","first-page":"500","DOI":"10.1016\/j.image.2016.05.004","volume":"47","author":"Y Kao","year":"2016","unstructured":"Kao, Y., Huang, K., Maybank, S.: Hierarchical aesthetic quality assessment using deep convolutional neural networks. Signal Process. Image Commun. 47, 500\u2013510 (2016)","journal-title":"Signal Process. Image Commun."},{"key":"1736_CR11","doi-asserted-by":"crossref","unstructured":"Lu, K.-H., Chang, K.-Y., Chen, C.-S.: Image aesthetic assessment via deep semantic aggregation. In: 2016 IEEE Global Conference on Signal and Information Processing, pp. 232\u2013236. IEEE (2016)","DOI":"10.1109\/GlobalSIP.2016.7905838"},{"key":"1736_CR12","doi-asserted-by":"publisher","first-page":"5009","DOI":"10.1109\/TIP.2022.3191853","volume":"31","author":"L Celona","year":"2022","unstructured":"Celona, L., Leonardi, M., Napoletano, P., Rozza, A.: Composition and style attributes guided image aesthetic assessment. IEEE Trans. Image Process. 31, 5009\u20135024 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"1736_CR13","first-page":"1","volume":"73","author":"H Chen","year":"2024","unstructured":"Chen, H., Shao, F., Mu, B., Jiang, Q.: Image aesthetics assessment with emotion-aware multi-branch network. IEEE Trans. Instrum. Meas. 73, 1\u201315 (2024)","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"1736_CR14","doi-asserted-by":"crossref","unstructured":"He, S., Zhang, Y., Xie, R., Jiang, D., Ming, A.: Rethinking image aesthetics assessment: models, datasets and benchmarks. In: Proceedings of the International Joint Conference on Artificial Intelligence, pp. 942\u2013948 (2022)","DOI":"10.24963\/ijcai.2022\/132"},{"key":"1736_CR15","doi-asserted-by":"crossref","unstructured":"Peng, Z., Huang, W., Gu, S., Xie, L., Wang, Y., Jiao, J., Ye, Q.: Conformer: local features coupling global representations for visual recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 367\u2013376 (2021)","DOI":"10.1109\/ICCV48922.2021.00042"},{"key":"1736_CR16","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Adv. Neural Inf. Process. Syst. 25 (2012)"},{"key":"1736_CR17","doi-asserted-by":"crossref","unstructured":"Maaz, M., Shaker, A., Cholakkal, H., Khan, S., Zamir, S.W., Anwer, R.M., Shahbaz Khan, F.: Edgenext: efficiently amalgamated CNN-transformer architecture for mobile vision applications. In: Proceedings of the European Conference on Computer Vision, pp. 3\u201320 (2022)","DOI":"10.1007\/978-3-031-25082-8_1"},{"issue":"5","key":"1736_CR18","doi-asserted-by":"publisher","first-page":"2483","DOI":"10.1007\/s00530-023-01141-7","volume":"29","author":"Y Ke","year":"2023","unstructured":"Ke, Y., Wang, Y., Wang, K., Qin, F., Guo, J., Yang, S.: Image aesthetics assessment using composite features from transformer and CNN. Multimed. Syst. 29(5), 2483\u20132494 (2023)","journal-title":"Multimed. Syst."},{"key":"1736_CR19","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1109\/TMM.2020.2985526","volume":"23","author":"X Zhang","year":"2020","unstructured":"Zhang, X., Gao, X., Lu, W., He, L., Li, J.: Beyond vision: a multimodal recurrent attention convolutional neural network for unified image aesthetic prediction tasks. IEEE Trans. Multimed. 23, 611\u2013623 (2020)","journal-title":"IEEE Trans. Multimed."},{"key":"1736_CR20","doi-asserted-by":"crossref","unstructured":"Hii, Y.-L., See, J., Kairanbay, M., Wong, L.-K.: Multigap: multi-pooled inception network with text augmentation for aesthetic prediction of photographs. In: Proceedings of the IEEE International Conference on Image Processing, pp. 1722\u20131726 (2017)","DOI":"10.1109\/ICIP.2017.8296576"},{"key":"1736_CR21","doi-asserted-by":"crossref","unstructured":"Miao, H., Zhang, Y., Wang, D., Feng, S.: Multimodal aesthetic analysis assisted by styles through a multimodal co-transformer model. In: Proceedings of the IEEE International Conference on Computational Science and Engineering, pp. 43\u201350 (2021)","DOI":"10.1109\/CSE53436.2021.00016"},{"key":"1736_CR22","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Lu, X., Zhang, J., Wang, J.Z.: Joint image and text representation for aesthetics analysis. In: Proceedings of the ACM International Conference on Multimedia, pp. 262\u2013266 (2016)","DOI":"10.1145\/2964284.2967223"},{"key":"1736_CR23","doi-asserted-by":"crossref","unstructured":"Zhu, T., Li, L., Chen, P., Wu, J., Yang, Y., Li, Y., Guo, Y.: Attribute-assisted multimodal network for image aesthetics assessment. In: Proceedings of the IEEE International Conference on Multimedia and Expo, pp. 2477\u20132482 (2023)","DOI":"10.1109\/ICME55011.2023.00422"},{"key":"1736_CR24","doi-asserted-by":"crossref","unstructured":"Nie, X., Hu, B., Gao, X., Li, L., Zhang, X., Xiao, B.: BMI-Net: a brain-inspired multimodal interaction network for image aesthetic assessment. In: Proceedings of the ACM International Conference on Multimedia, pp. 5514\u20135522 (2023)","DOI":"10.1145\/3581783.3611996"},{"key":"1736_CR25","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.neucom.2020.10.046","volume":"430","author":"X Zhang","year":"2021","unstructured":"Zhang, X., Gao, X., He, L., Lu, W.: MSCAN: multimodal self-and-collaborative attention network for image aesthetic prediction tasks. Neurocomputing 430, 14\u201323 (2021)","journal-title":"Neurocomputing"},{"key":"1736_CR26","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth $$16 \\times 16$$ words: Transformers for image recognition at scale. Preprint (2020)"},{"key":"1736_CR27","first-page":"5998","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30, 5998\u20136008 (2017)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1736_CR28","doi-asserted-by":"crossref","unstructured":"Cho, K., Van\u00a0Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. In: Proceedings of the IEEE Conference on Empirical Methods in Natural La, pp. 1724\u20131734 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"issue":"6","key":"1736_CR29","doi-asserted-by":"publisher","first-page":"1452","DOI":"10.1109\/TPAMI.2017.2723009","volume":"40","author":"B Zhou","year":"2017","unstructured":"Zhou, B., Lapedriza, A., Khosla, A., Oliva, A., Torralba, A.: Places: a 10 million image database for scene recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40(6), 1452\u20131464 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"05","key":"1736_CR30","doi-asserted-by":"publisher","first-page":"2770","DOI":"10.1109\/TPAMI.2023.3334624","volume":"46","author":"X Deng","year":"2024","unstructured":"Deng, X., Xu, J., Gao, F., Sun, X., Xu, M.: Deep-CDL: deep multi-scale multi-modal convolutional dictionary learning network. IEEE Trans. Pattern Anal. Mach. Intell. 46(05), 2770\u20132787 (2024)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1736_CR31","doi-asserted-by":"publisher","first-page":"442","DOI":"10.1109\/TMM.2024.3521720","volume":"27","author":"J Xu","year":"2024","unstructured":"Xu, J., Deng, X., Fu, Y., Xu, M., Li, S.: MDSC-Net: multi-modal discriminative sparse coding driven RGB-D classification network. IEEE Trans. Multimed. 27, 442\u2013454 (2024)","journal-title":"IEEE Trans. Multimed."},{"key":"1736_CR32","doi-asserted-by":"crossref","unstructured":"Sun, M., Ma, W., Liu, Y.: Global and local feature interaction with vision transformer for few-shot image classification. In: Proceedings of the ACM International Conference on Information & Knowledge Management, pp. 4530\u20134534 (2022)","DOI":"10.1145\/3511808.3557604"},{"key":"1736_CR33","doi-asserted-by":"publisher","first-page":"2152","DOI":"10.1109\/JBHI.2024.3350077","volume":"28","author":"X Yi","year":"2024","unstructured":"Yi, X., Fu, Y., Liu, R., Zhang, H., Hua, R.: TSGET: two-stage global enhanced transformer for automatic radiology report generation. IEEE J. Biomed. Health Inform. 28(4), 2152\u20132162 (2024)","journal-title":"IEEE J. Biomed. Health Inform."},{"key":"1736_CR34","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1736_CR35","doi-asserted-by":"crossref","unstructured":"Ke, J., Wang, Q., Wang, Y., Milanfar, P., Yang, F.: MUSIQ: multi-scale image quality transformer. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5148\u20135157 (2021)","DOI":"10.1109\/ICCV48922.2021.00510"},{"key":"1736_CR36","doi-asserted-by":"crossref","unstructured":"Loper, E., Bird, S.: NLTK: the natural language toolkit. In: Proceedings of the COLING Interactive Presentation Sessions, pp. 69\u201372 (2006)","DOI":"10.3115\/1225403.1225421"},{"key":"1736_CR37","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"1736_CR38","doi-asserted-by":"crossref","unstructured":"Geva, M., Schuster, R., Berant, J., Levy, O.: Transformer feed-forward layers are key-value memories. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, pp. 5484\u20135495 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.446"},{"key":"1736_CR39","doi-asserted-by":"crossref","unstructured":"Jin, X., Wu, L., Li, X., Chen, S., Peng, S., Chi, J., Ge, S., Song, C., Zhao, G.: Predicting aesthetic score distribution through cumulative Jensen\u2013Shannon divergence. In: Proceedings of the AAAI Conference on Artificial Intelligence (2018)","DOI":"10.1609\/aaai.v32i1.11286"},{"key":"1736_CR40","doi-asserted-by":"crossref","unstructured":"Murray, N., Marchesotti, L., Perronnin, F.: AVA: a large-scale database for aesthetic visual analysis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2408\u20132415 (2012)","DOI":"10.1109\/CVPR.2012.6247954"},{"key":"1736_CR41","doi-asserted-by":"crossref","unstructured":"Datta, R., Joshi, D., Li, J., Wang, J.Z.: Studying aesthetics in photographic images using a computational approach. In: Computer Vision\u2013ECCV 2006: 9th European Conference on Computer Vision, Graz, Austria, May 7\u201313, 2006, Proceedings, Part III 9, pp. 288\u2013301. Springer (2006)","DOI":"10.1007\/11744078_23"},{"key":"1736_CR42","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. Preprint (2014)"},{"issue":"8","key":"1736_CR43","doi-asserted-by":"publisher","first-page":"3998","DOI":"10.1109\/TIP.2018.2831899","volume":"27","author":"H Talebi","year":"2018","unstructured":"Talebi, H., Milanfar, P.: NIMA: neural image assessment. IEEE Trans. Image Process. 27(8), 3998\u20134011 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"1736_CR44","doi-asserted-by":"crossref","unstructured":"She, D., Lai, Y.-K., Yi, G., Xu, K.: Hierarchical layout-aware graph convolutional network for unified aesthetics assessment. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8475\u20138484 (2021)","DOI":"10.1109\/CVPR46437.2021.00837"},{"key":"1736_CR45","doi-asserted-by":"crossref","unstructured":"Ghosal, K., Smolic, A.: Image aesthetics assessment using graph attention network. In: Proceedings of the IEEE International Conference on Pattern Recognition, pp. 3160\u20133167 (2022)","DOI":"10.1109\/ICPR56361.2022.9956162"},{"issue":"9","key":"1736_CR46","doi-asserted-by":"publisher","first-page":"5716","DOI":"10.1109\/TCYB.2022.3169017","volume":"53","author":"J Yang","year":"2022","unstructured":"Yang, J., Zhou, Y., Zhao, Y., Lu, W., Gao, X.: MetaMP: metalearning-based multipatch image aesthetics assessment. IEEE Trans. Cybern. 53(9), 5716\u20135728 (2022)","journal-title":"IEEE Trans. Cybern."},{"issue":"11","key":"1736_CR47","doi-asserted-by":"publisher","first-page":"8654","DOI":"10.1109\/TNNLS.2022.3151787","volume":"34","author":"G Jia","year":"2022","unstructured":"Jia, G., Li, P., He, R.: Theme-aware aesthetic distribution prediction with full-resolution photographs. IEEE Trans. Neural Netw. Learn. Syst. 34(11), 8654\u20138668 (2022)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"issue":"9","key":"1736_CR48","doi-asserted-by":"publisher","first-page":"4798","DOI":"10.1109\/TCSVT.2023.3249185","volume":"33","author":"L Li","year":"2023","unstructured":"Li, L., Huang, Y., Wu, J., Yang, Y., Li, Y., Guo, Y., Shi, G.: Theme-aware visual attribute reasoning for image aesthetics assessment. IEEE Trans. Circ. Syst. Video Technol. 33(9), 4798\u20134811 (2023)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"1736_CR49","doi-asserted-by":"crossref","unstructured":"Ke, J., Ye, K., Yu, J., Wu, Y., Milanfar, P., Yang, F.: Vila: learning image aesthetics from user comments with vision-language pretraining. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 10041\u201310051 (2023)","DOI":"10.1109\/CVPR52729.2023.00968"},{"key":"1736_CR50","doi-asserted-by":"crossref","unstructured":"Li, S., Liang, H., Xie, M., He, X.: Multi-scale and multi-patch aggregation network based on dual-column vision fusion for image aesthetics assessment. In: 2024 IEEE International Conference on Multimedia and Expo, pp. 1\u20136. IEEE (2024)","DOI":"10.1109\/ICME57554.2024.10687850"},{"key":"1736_CR51","doi-asserted-by":"publisher","first-page":"9316","DOI":"10.1109\/TMM.2024.3389452","volume":"26","author":"Y Huang","year":"2024","unstructured":"Huang, Y., Li, L., Chen, P., Wu, J., Yang, Y., Li, Y., Shi, G.: Coarse-to-fine image aesthetics assessment with dynamic attribute selection. IEEE Trans. Multimed. 26, 9316\u20139329 (2024)","journal-title":"IEEE Trans. Multimed."},{"issue":"11","key":"1736_CR52","first-page":"1","volume":"20","author":"T Shi","year":"2024","unstructured":"Shi, T., Chen, C., Wu, Z., Hao, A., Fang, Y.: Improving image aesthetic assessment via multiple image joint learning. ACM Trans. Multimed. Comput. Commun. Appl. 20(11), 1\u201324 (2024)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"1736_CR53","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127434","volume":"582","author":"T Shi","year":"2024","unstructured":"Shi, T., Chen, C., Li, X., Hao, A.: Semantic and style based multiple reference learning for artistic and general image aesthetic assessment. Neurocomputing 582, 127434 (2024)","journal-title":"Neurocomputing"},{"key":"1736_CR54","doi-asserted-by":"crossref","unstructured":"He, S., Ming, A., Zheng, S., Zhong, H., Ma, H.: Eat: An enhancer for aesthetics-oriented transformers. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 1023\u20131032 (2023)","DOI":"10.1145\/3581783.3611881"},{"key":"1736_CR55","doi-asserted-by":"crossref","unstructured":"Chen, Q., Zhang, W., Zhou, N., Lei, P., Xu, Y., Zheng, Y., Fan, J.: Adaptive fractional dilated convolution network for image aesthetics assessment. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 14114\u201314123 (2020)","DOI":"10.1109\/CVPR42600.2020.01412"},{"key":"1736_CR56","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning, pp. 8748\u20138763 (2021)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01736-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01736-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01736-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T19:35:11Z","timestamp":1745264111000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01736-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,9]]},"references-count":56,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["1736"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01736-2","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,3,9]]},"assertion":[{"value":"8 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 March 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"145"}}