{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T09:14:04Z","timestamp":1773998044113,"version":"3.50.1"},"reference-count":78,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,25]],"date-time":"2025-12-25T00:00:00Z","timestamp":1766620800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,12,25]],"date-time":"2025-12-25T00:00:00Z","timestamp":1766620800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62136001"],"award-info":[{"award-number":["62136001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Program for Youth Innovative Research Team of BUPT","award":["2023YQTD02"],"award-info":[{"award-number":["2023YQTD02"]}]},{"name":"Program for Youth Innovative Research Team of BUPT","award":["U23B2052"],"award-info":[{"award-number":["U23B2052"]}]},{"DOI":"10.13039\/501100009592","name":"Beijing Municipal Science & Technology Commission, Administrative Commission of Zhongguancun Science Park","doi-asserted-by":"publisher","award":["Z241100003524012"],"award-info":[{"award-number":["Z241100003524012"]}],"id":[{"id":"10.13039\/501100009592","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1007\/s11263-025-02678-y","type":"journal-article","created":{"date-parts":[[2025,12,25]],"date-time":"2025-12-25T15:58:23Z","timestamp":1766678303000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Affective Image Editing: Shaping Emotional Factors via Text Descriptions"],"prefix":"10.1007","volume":"134","author":[{"given":"Peixuan","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuchen","family":"Weng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengxuan","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Binghao","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zijian","family":"Jia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Si","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Boxin","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,12,25]]},"reference":[{"key":"2678_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F. L., ... & McGrew, B. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2678_CR2","doi-asserted-by":"crossref","unstructured":"Achlioptas, P., Ovsjanikov, M., Haydarov, K., Elhoseiny, M., & Guibas, L. J. (2021). Artemis: Affective language for visual art. In: Proc. of Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR46437.2021.01140"},{"key":"2678_CR3","doi-asserted-by":"crossref","unstructured":"Achlioptas, P., Ovsjanikov, M., Guibas, L., & Tulyakov, S. (2023). Affection: Learning affective explanations for real-world visual data. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6641\u20136651.","DOI":"10.1109\/CVPR52729.2023.00642"},{"key":"2678_CR4","doi-asserted-by":"crossref","unstructured":"Anand, S., Devulapally, N. K., Bhattacharjee, S. D., & Yuan, J. (2023). Multi-label emotion analysis in conversation via multimodal knowledge distillation.","DOI":"10.1145\/3581783.3612517"},{"key":"2678_CR5","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., & Efros, A. A. (2023). InstructPix2Pix: Learning to follow image editing instructions. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"2678_CR6","first-page":"77174","volume":"36","author":"S Weng","year":"2023","unstructured":"Weng, S., Zhang, P., Li, Y., Li, S., & Shi, B. (2023). L-CAD: Language-based colorization with any-level descriptions using diffusion priors. Proc of Advances in Neural Information Processing Systems, 36, 77174\u201377186.","journal-title":"Proc of Advances in Neural Information Processing Systems"},{"key":"2678_CR7","unstructured":"Chen, J., Yu, J., Ge, C., Yao, L., Xie, E., Wu, Y., ... & Li, Z. (2023). PixArt-$$\\alpha $$: Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426"},{"key":"2678_CR8","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, J., Dong, X., Zhang, P., He, C., Wang, J., Zhao, F., & Lin, D. (2024). ShareGPT4V: Improving large multi-modal models with better captions. In: Proc. of European Conference on Computer Vision.","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"2678_CR9","doi-asserted-by":"crossref","unstructured":"Cheng, D., Gong, Y., Zhou, S., Wang, J., & Zheng, N. (2016). Person re-identification by multi-channel parts-based cnn with improved triplet loss function. In: CVPR.","DOI":"10.1109\/CVPR.2016.149"},{"issue":"3","key":"2678_CR10","doi-asserted-by":"publisher","first-page":"7","DOI":"10.2753\/JOA0091-3367370301","volume":"37","author":"RM Chowdhury","year":"2008","unstructured":"Chowdhury, R. M., Olsen, G. D., & Pracejus, J. W. (2008). Affective responses to images in print advertising: Affect integration in a simultaneous presentation context. Journal of Advertising, 37(3), 7\u201318.","journal-title":"Journal of Advertising"},{"key":"2678_CR11","doi-asserted-by":"crossref","unstructured":"Cohn, J. F., & Kanade, T. (2007). Use of automated facial image analysis for measurement of emotion expression. Handbook of emotion elicitation and assessment.","DOI":"10.1093\/oso\/9780195169157.003.0015"},{"key":"2678_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, LJ., Li, K., & Fei-Fei, L. (2009). ImageNet: A large-scale hierarchical image database. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2678_CR13","unstructured":"Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2019). BERT: Pre-training of deep bidirectional transformers for language understanding. In: North American Chapter of the Association for Computational Linguistics."},{"key":"2678_CR14","unstructured":"Fu, T. J., Hu, W., Du, X., Wang, W. Y., Yang, Y., & Gan, Z. (2024). Guiding instruction-based image editing via multimodal large language models."},{"key":"2678_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2678_CR16","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., & Cohen-Or, D. (2023). Prompt-to-prompt image editing with cross attention control."},{"key":"2678_CR17","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., & Hochreiter, S. (2017). GANs trained by a two time-scale update rule converge to a local nash equilibrium. Proc of Advances in Neural Information Processing Systems."},{"key":"2678_CR18","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Proc of Advances in Neural Information Processing Systems, 33, 6840\u20136851.","journal-title":"Proc of Advances in Neural Information Processing Systems"},{"key":"2678_CR19","doi-asserted-by":"crossref","unstructured":"Ito, T., Tsubouchi, K., Sakaji, H., Yamashita, T., & Izumi, K. (2020). Word-level contextual sentiment analysis with interpretability. In: Proc. of the AAAI Conference on Artificial Intelligence.","DOI":"10.1609\/aaai.v34i04.5845"},{"issue":"18","key":"2678_CR20","doi-asserted-by":"publisher","first-page":"2317","DOI":"10.1097\/00001756-200312190-00006","volume":"14","author":"H Kim","year":"2003","unstructured":"Kim, H., Somerville, L. H., Johnstone, T., Alexander, A. L., & Whalen, P. J. (2003). Inverse amygdala and medial prefrontal cortex responses to surprised faces. Neuroreport, 14(18), 2317\u20132322.","journal-title":"Neuroreport"},{"key":"2678_CR21","unstructured":"Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"key":"2678_CR22","first-page":"22199","volume":"35","author":"T Kojima","year":"2022","unstructured":"Kojima, T., Gu, S. S., Reid, M., Matsuo, Y., & Iwasawa, Y. (2022). Large language models are zero-shot reasoners. Proc of Advances in Neural Information Processing Systems, 35, 22199\u201322213.","journal-title":"Proc of Advances in Neural Information Processing Systems"},{"issue":"1","key":"2678_CR23","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1214\/aoms\/1177729694","volume":"22","author":"S Kullback","year":"1951","unstructured":"Kullback, S., & Leibler, R. A. (1951). On information and sufficiency. The Annals of Mathematical Statistics, 22(1), 79\u201386.","journal-title":"The Annals of Mathematical Statistics"},{"key":"2678_CR24","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., & Zhu, J. Y. (2023). Multi-concept customization of text-to-image diffusion. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"2678_CR25","doi-asserted-by":"crossref","unstructured":"Kundu, T., & Saravanan, C. (2017). Advancements and recent trends in emotion recognition using facial image analysis and machine learning models. In: International Conference on Electrical, Electronics, Communication, Computer, and Optimization Techniques.","DOI":"10.1109\/ICEECCOT.2017.8284512"},{"key":"2678_CR26","unstructured":"Li, C., Wang, J., Zhang, Y., Zhu, K., Hou, W., Lian, J., Luo, F., Yang, Q., & Xie, X. (2023). Large language models understand and can be enhanced by emotional stimuli. arXiv preprint arXiv:2307.11760"},{"key":"2678_CR27","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023). BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proc. of International Conference on Machine Learning."},{"key":"2678_CR28","unstructured":"Li, Z., Chen, G., Shao, R., Jiang, D., & Nie, L. (2024). Enhancing the emotional generation capability of large language models via emotional chain-of-thought. arXiv preprint arXiv:2401.06836"},{"key":"2678_CR29","unstructured":"Lin, Q., Zhang, J., Ong, Y. S., Zhang, M. (2024). Make me happier: Evoking emotions through image diffusion models. arXiv preprint arXiv:2403.08255"},{"key":"2678_CR30","first-page":"34892","volume":"36","author":"H Liu","year":"2024","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2024). Visual instruction tuning. Proc of Advances in Neural Information Processing Systems, 36, 34892\u201334916.","journal-title":"Proc of Advances in Neural Information Processing Systems"},{"key":"2678_CR31","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, X., & Yang, J. (2022). SER30K: A large-scale dataset for sticker emotion recognition.","DOI":"10.1145\/3503161.3548407"},{"key":"2678_CR32","doi-asserted-by":"crossref","unstructured":"Liu, Z., Yang, K., Zhang, T., Xie, Q., Yu, Z., & Ananiadou, S. (2024). EmoLLMs: A series of emotional large language models and annotation tools for comprehensive affective analysis. arXiv preprint arXiv:2401.08508","DOI":"10.1145\/3637528.3671552"},{"key":"2678_CR33","doi-asserted-by":"crossref","unstructured":"Mathews, A., Xie, L., & He, X. (2016). Senticap: Generating image descriptions with sentiments. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a030.","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"2678_CR34","unstructured":"Meng, C., He, Y., Song, Y., Song, J., Wu, J., Zhu, J. Y., & Ermon, S. (2022). SDEdit: Guided image synthesis and editing with stochastic differential equations."},{"issue":"4","key":"2678_CR35","doi-asserted-by":"publisher","first-page":"626","DOI":"10.3758\/BF03192732","volume":"37","author":"JA Mikels","year":"2005","unstructured":"Mikels, J. A., Fredrickson, B. L., Larkin, G. R., Lindberg, C. M., Maglio, S. J., & Reuter-Lorenz, P. A. (2005). Emotional category data on images from the international affective picture system. Behavior Research Methods, 37(4), 626\u2013630.","journal-title":"Behavior Research Methods"},{"key":"2678_CR36","doi-asserted-by":"crossref","unstructured":"Mohamed, Y., Khan, F. F., Haydarov, K., & Elhoseiny, M. (2022). It is okay to not be okay: Overcoming emotional bias in affective image captioning by contrastive data collection. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.02058"},{"key":"2678_CR37","doi-asserted-by":"crossref","unstructured":"Mohammad, S. (2018). Obtaining reliable human ratings of valence, arousal, and dominance for 20,000 english words.","DOI":"10.18653\/v1\/P18-1017"},{"key":"2678_CR38","doi-asserted-by":"crossref","unstructured":"Mokady, R., Hertz, A., Aberman, K., Pritch, Y., & Cohen-Or, D. (2023). Null-text inversion for editing real images using guided diffusion models. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.00585"},{"issue":"5","key":"2678_CR39","doi-asserted-by":"publisher","first-page":"640","DOI":"10.1037\/a0016819","volume":"9","author":"M Neta","year":"2009","unstructured":"Neta, M., Norris, C. J., & Whalen, P. J. (2009). Corrugator muscle responses are associated with individual differences in positivity-negativity bias. Emotion, 9(5), 640.","journal-title":"Emotion"},{"issue":"4","key":"2678_CR40","doi-asserted-by":"publisher","first-page":"722","DOI":"10.1080\/02699931.2020.1862063","volume":"35","author":"M Neta","year":"2021","unstructured":"Neta, M., Berkebile, M. M., & Freeman, J. B. (2021). The dynamic process of ambiguous emotion perception. Cognition and Emotion, 35(4), 722\u2013729.","journal-title":"Cognition and Emotion"},{"key":"2678_CR41","doi-asserted-by":"crossref","unstructured":"Parmar, G., Kumar\u00a0Singh, K., Zhang, R., Li, Y., Lu, J., & Zhu, J. Y. (2023). Zero-shot image-to-image translation. In: Proc. of ACM SIGGRAPH.","DOI":"10.1145\/3588432.3591513"},{"key":"2678_CR42","doi-asserted-by":"crossref","unstructured":"Paskaleva, R., Holubakha, M., Ilic, A., Motamed, S., Van\u00a0Gool, L., & Paudel, D. (2024). A unified and interpretable emotion representation and expression generation. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52733.2024.00237"},{"key":"2678_CR43","unstructured":"Podell, D., English, Z., Lacey, K., Blattmann, A., Dockhorn, T., M\u00fcller, J., Penna, J., & Rombach, R. (2023). SDXL: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952"},{"key":"2678_CR44","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., ... & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. In: Proc. of International Conference on Machine Learning."},{"key":"2678_CR45","doi-asserted-by":"crossref","unstructured":"Rao, T., Xu, M., Liu, H., Wang, J., & Burnett, I. (2016). Multi-scale blocks based image emotion classification using multiple instance learning. In: Proc. of International Conference on Image Processing.","DOI":"10.1109\/ICIP.2016.7532434"},{"key":"2678_CR46","doi-asserted-by":"publisher","first-page":"429","DOI":"10.1016\/j.neucom.2018.12.053","volume":"333","author":"T Rao","year":"2019","unstructured":"Rao, T., Li, X., Zhang, H., & Xu, M. (2019). Multi-level region-based convolutional neural network for image emotion classification. Neurocomputing, 333, 429\u2013439.","journal-title":"Neurocomputing"},{"key":"2678_CR47","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2678_CR48","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., & Aberman, K. (2023). DreamBooth: Fine tuning text-to-image diffusion models for subject-driven generation. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"2678_CR49","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E. L., ... & Norouzi, M. (2022). Photorealistic text-to-image diffusion models with deep language understanding. Proc of Advances in Neural Information Processing Systems 35, 36479-36494"},{"key":"2678_CR50","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., ... & Jitsev, J. (2022). LAION-5B: An open large-scale dataset for training next generation image-text models. Proc of Advances in Neural Information Processing Systems."},{"key":"2678_CR51","unstructured":"Song, J., Meng, C., & Ermon, S. (2020). Denoising diffusion implicit models"},{"key":"2678_CR52","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. (2016). Rethinking the inception architecture for computer vision. In: Proc. of International Conference on Computer Vision.","DOI":"10.1109\/CVPR.2016.308"},{"key":"2678_CR53","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., ... & Scialom, T. (2023). Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288"},{"key":"2678_CR54","doi-asserted-by":"publisher","first-page":"180","DOI":"10.3389\/fpsyg.2016.00180","volume":"7","author":"MK Uhrig","year":"2016","unstructured":"Uhrig, M. K., Trautmann, N., Baumg\u00e4rtner, U., Treede, R. D., Henrich, F., Hiller, W., & Marschall, S. (2016). Emotion elicitation: A comparison of pictures and films. Frontiers in Psychology, 7, 180.","journal-title":"Frontiers in Psychology"},{"key":"2678_CR55","doi-asserted-by":"crossref","unstructured":"Wang, L., Jia, G., Jiang, N., Wu, H., & Yang, J. (2022). Ease: Robust facial expression recognition via emotion ambiguity-sensitive cooperative networks.","DOI":"10.1145\/3503161.3548005"},{"key":"2678_CR56","doi-asserted-by":"crossref","unstructured":"Wang, X., Jia, J., Yin, J., & Cai, L. (2013). Interpretable aesthetic features for affective image classification. In: Proc. of International Conference on Image Processing.","DOI":"10.1109\/ICIP.2013.6738665"},{"key":"2678_CR57","doi-asserted-by":"crossref","unstructured":"Weng, S., Zhang, P., Chang, Z., Wang, X., Li, S., & Shi, B. (2023). Affective image filter: Reflecting emotions from text to images. In: Proc. of International Conference on Computer Vision.","DOI":"10.1109\/ICCV51070.2023.00992"},{"key":"2678_CR58","doi-asserted-by":"crossref","unstructured":"Xie, S., Zhang, Z., Lin, Z., Hinz, T., & Zhang, K. (2023). SmartBrush: Text and shape guided object inpainting with diffusion model. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"2678_CR59","doi-asserted-by":"crossref","unstructured":"Xu, T., Zhang, P., Huang, Q., Zhang, H., Gan, Z., Huang, X., & He, X. (2018). AttnGAN: Fine-grained text to image generation with attentional generative adversarial networks. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2018.00143"},{"key":"2678_CR60","doi-asserted-by":"crossref","unstructured":"Yang, D., Chen, Z., Wang, Y., Wang, S., Li, M., Liu, S., Zhao, X., Huang, S., Dong, Z., Zhai, P., & Zhang. L. (2023). Context de-confounded emotion recognition. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.01822"},{"key":"2678_CR61","doi-asserted-by":"publisher","first-page":"7432","DOI":"10.1109\/TIP.2021.3106813","volume":"30","author":"J Yang","year":"2021","unstructured":"Yang, J., Li, J., Wang, X., Ding, Y., & Gao, X. (2021). Stimuli-aware visual emotion analysis. IEEE Transactions on Image Processing, 30, 7432\u20137445.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2678_CR62","doi-asserted-by":"crossref","unstructured":"Yang, J., Huang, Q., Ding, T., Lischinski, D., Cohen-Or, D., & Huang, H. (2023). EmoSet: A large-scale visual emotion dataset with rich attributes. In: Proc. of International Conference on Computer Vision.","DOI":"10.1109\/ICCV51070.2023.01864"},{"key":"2678_CR63","doi-asserted-by":"crossref","unstructured":"Yang, J., Feng, J., & Huang, H. (2024). EmoGen: Emotional image content generation with text-to-image diffusion models. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52733.2024.00608"},{"key":"2678_CR64","doi-asserted-by":"crossref","unstructured":"Yang, J., Feng, J., Luo, W., Lischinski, D., Cohen-Or, D., & Huang, H. (2025). Emoedit: Evoking emotions through image manipulation. In: Proc. of Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52734.2025.02299"},{"key":"2678_CR65","unstructured":"Yang, L., Yu, Z., Meng, C., Xu, M., Ermon, S., & Cui, B. (2024). Mastering text-to-image diffusion: Recaptioning, planning, and generating with multimodal llms. In: Proc. of International Conference on Machine Learning."},{"key":"2678_CR66","unstructured":"Yang, Z., Li, L., Lin, K., Wang, J., Lin, C. C., Liu, Z., & Wang, L. (2023). The dawn of LMMs: Preliminary explorations with GPT-4V (ision). arXiv preprint arXiv:2309.17421"},{"key":"2678_CR67","doi-asserted-by":"publisher","first-page":"1640","DOI":"10.1109\/TMM.2020.3001527","volume":"23","author":"X Yao","year":"2020","unstructured":"Yao, X., She, D., Zhang, H., Yang, J., Cheng, M. M., & Wang, L. (2020). Adaptive deep metric learning for affective image retrieval and classification. IEEE Transactions on Multimedia, 23, 1640\u20131653.","journal-title":"IEEE Transactions on Multimedia"},{"key":"2678_CR68","doi-asserted-by":"crossref","unstructured":"You, Q., Luo, J., Jin, H., & Yang, J. (2016). Building a large scale dataset for image emotion recognition: The fine print and the benchmark. In: Proc. of the AAAI Conference on Artificial Intelligence.","DOI":"10.1609\/aaai.v30i1.9987"},{"key":"2678_CR69","doi-asserted-by":"crossref","unstructured":"Zabari, N., Azulay, A., Gorkor, A., Halperin, T., & Fried, O. (2023). Diffusing colors: Image colorization with text guided diffusion. In: Proc. of ACM SIGGRAPH Asia.","DOI":"10.1145\/3610548.3618180"},{"key":"2678_CR70","doi-asserted-by":"crossref","unstructured":"Zhang, H., Xu, T., Li, H., Zhang, S., Wang, X., Huang, X., Metaxas, D. N. (2017). StackGAN: Text to photo-realistic image synthesis with stacked generative adversarial networks. In: Proc. of International Conference on Computer Vision.","DOI":"10.1109\/ICCV.2017.629"},{"key":"2678_CR71","first-page":"31428","volume":"36","author":"K Zhang","year":"2024","unstructured":"Zhang, K., Mo, L., Chen, W., Sun, H., & Su, Y. (2024). MagicBrush: A manually annotated dataset for instruction-guided image editing. Proc of Advances in Neural Information Processing Systems, 36, 31428\u201331449.","journal-title":"Proc of Advances in Neural Information Processing Systems"},{"key":"2678_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, L., Wang, S., & Liu, B. (2018). Deep learning for sentiment analysis: A survey. Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery.","DOI":"10.1002\/widm.1253"},{"key":"2678_CR73","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., & Agrawala, M. (2023). Adding conditional control to text-to-image diffusion models. In: Proc. of International Conference on Computer Vision.","DOI":"10.1109\/ICCV51070.2023.00355"},{"issue":"11","key":"2678_CR74","doi-asserted-by":"publisher","first-page":"11019","DOI":"10.1109\/TKDE.2022.3230975","volume":"35","author":"W Zhang","year":"2022","unstructured":"Zhang, W., Li, X., Deng, Y., Bing, L., & Lam, W. (2022). A survey on aspect-based sentiment analysis: Tasks, methods, and challenges. IEEE Transactions on Knowledge and Data Engineering, 35(11), 11019\u201311038.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"2678_CR75","doi-asserted-by":"crossref","unstructured":"Zhao, S., Gao, Y., Jiang, X., Yao, H., Chua, T. S., & Sun, X. (2014). Exploring principles-of-art features for image emotion recognition.","DOI":"10.1145\/2647868.2654930"},{"key":"2678_CR76","doi-asserted-by":"crossref","unstructured":"Zhao, S., Yao, H., Yang, Y., & Zhang, Y. (2014). Affective image retrieval via multi-graph learning.","DOI":"10.1145\/2647868.2655035"},{"key":"2678_CR77","doi-asserted-by":"crossref","unstructured":"Zhao, S., Yao, X., Yang, J., Jia, G., Ding, G., Chua, T. S., Schuller, B. W., Keutzer, K. (2021). Affective image content analysis: Two decades review and new perspectives.","DOI":"10.1109\/TPAMI.2021.3094362"},{"key":"2678_CR78","doi-asserted-by":"crossref","unstructured":"Zhou B, Lapedriza A, Khosla A, Oliva A, Torralba A (2017) Places: A 10 million image database for scene recognition.","DOI":"10.1167\/17.10.296"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02678-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02678-y","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02678-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T15:43:57Z","timestamp":1771602237000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02678-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,25]]},"references-count":78,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,1]]}},"alternative-id":["2678"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02678-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12,25]]},"assertion":[{"value":"12 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"16"}}