{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:14:27Z","timestamp":1765008867380,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3771032","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:08:11Z","timestamp":1765008491000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["EmoSEM: Segment and Explain Emotion Stimuli in Visual Art"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-1590-5886","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"first","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2594-254X","authenticated-orcid":false,"given":"Dan","family":"Guo","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China and Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2227-8826","authenticated-orcid":false,"given":"Zhangbin","family":"Li","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3094-7735","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01140"},{"key":"e_1_3_3_3_3_2","first-page":"65","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65\u201372."},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"crossref","unstructured":"Margaret\u00a0M Bradley and Peter\u00a0J Lang. 2007. The International Affective Picture System (IAPS) in the study of emotion and attention. (2007).","DOI":"10.1093\/oso\/9780195169157.003.0003"},{"key":"e_1_3_3_3_5_2","unstructured":"Jinglun Cen Chunmei Qing Haochun Ou Xiangmin Xu and Junpeng Tan. 2024. MASANet: Multi-Aspect Semantic Auxiliary Network for Visual Sentiment Analysis. IEEE TAC (2024) 1\u201312."},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2015.7351656"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658102"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"crossref","unstructured":"Alan\u00a0S Cowen and Dacher Keltner. 2020. Universal facial expressions uncovered in art of the ancient Americas: A computational approach. Science advances 6 34 (2020) eabb1005.","DOI":"10.1126\/sciadv.abb1005"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.4324\/9781315770727"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"crossref","unstructured":"Paul Ekman et\u00a0al. 1992. An argument for basic emotions. Cognition and emotion 6 3-4 (1992) 169\u2013200.","DOI":"10.1080\/02699939208411068"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"crossref","unstructured":"Mauajama Firdaus Hardik Chauhan Asif Ekbal and Pushpak Bhattacharyya. 2020. EmoSen: Generating sentiment and emotion controlled responses in a multimodal dialogue system. IEEE Transactions on Affective Computing 13 3 (2020) 1555\u20131566.","DOI":"10.1109\/TAFFC.2020.3015491"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"crossref","unstructured":"Jack Hessel Ari Holtzman Maxwell Forbes Ronan\u00a0Le Bras and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.08718 (2021).","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01273"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"crossref","unstructured":"Kiyohito Iigaya Sanghyun Yi Iman\u00a0A Wahle Koranis Tanwisuth and John\u00a0P O\u2019Doherty. 2021. Aesthetic preference for art can be predicted from a mixture of low-and high-level visual features. Nature Human Behaviour 5 6 (2021) 743\u2013755.","DOI":"10.1038\/s41562-021-01124-6"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428453"},{"key":"e_1_3_3_3_17_2","first-page":"74","volume-title":"Text summarization branches out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658104"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02022"},{"key":"e_1_3_3_3_21_2","first-page":"1","volume-title":"NeurIPS","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning. In NeurIPS. 1\u201325."},{"key":"e_1_3_3_3_22_2","unstructured":"Jiasen Lu Dhruv Batra Devi Parikh and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2016.79"},{"key":"e_1_3_3_3_26_2","volume-title":"Proceedings of the eleventh international conference on language resources and evaluation (LREC 2018)","author":"Mohammad Saif","year":"2018","unstructured":"Saif Mohammad and Svetlana Kiritchenko. 2018. Wikiart emotions: An annotated dataset of emotions evoked by art. In Proceedings of the eleventh international conference on language resources and evaluation (LREC 2018)."},{"key":"e_1_3_3_3_27_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_3_3_28_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_3_29_2","unstructured":"Tianhe Ren Yihao Chen Qing Jiang Zhaoyang Zeng Yuda Xiong Wenlong Liu Zhengyu Ma Junyi Shen Yuan Gao Xiaoke Jiang et\u00a0al. 2025. Dino-x: A unified vision model for open-world object detection and understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.14347 (2025)."},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00560"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"crossref","unstructured":"Paul\u00a0J Silvia. 2005. Emotional responses to art: From collation and arousal to cognition and emotion. Review of general psychology 9 4 (2005) 342\u2013357.","DOI":"10.1037\/1089-2680.9.4.342"},{"key":"e_1_3_3_3_32_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"crossref","unstructured":"Xinxiao Wu and Tong Li. 2023. Sentimental Visual Captioning using Multimodal Transformer. IJCV 131 4 (2023) 1073\u20131090.","DOI":"10.1007\/s11263-023-01752-7"},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"crossref","unstructured":"Zhiwei Xu and Shangfei Wang. 2021. Emotional attention detection and correlation exploration for image emotion distribution learning. IEEE Transactions on Affective Computing 14 1 (2021) 357\u2013369.","DOI":"10.1109\/TAFFC.2021.3071131"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"crossref","unstructured":"Jingyuan Yang Xinbo Gao Leida Li Xiumei Wang and Jinshan Ding. 2021. SOLVER: Scene-object interrelated visual emotion reasoning network. IEEE Transactions on Image Processing 30 (2021) 8686\u20138701.","DOI":"10.1109\/TIP.2021.3118983"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"crossref","unstructured":"Jingyuan Yang Jie Li Leida Li Xiumei Wang Yuxuan Ding and Xinbo Gao. 2022. Seeking subjectivity in visual emotion distribution learning. IEEE Transactions on Image Processing 31 (2022) 5189\u20135202.","DOI":"10.1109\/TIP.2022.3193749"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"crossref","unstructured":"Jingyuan Yang Jie Li Xiumei Wang Yuxuan Ding and Xinbo Gao. 2021. Stimuli-aware visual emotion analysis. IEEE TIP 30 (2021) 7432\u20137445.","DOI":"10.1109\/TIP.2021.3106813"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"crossref","unstructured":"Jufeng Yang Dongyu She Ming Sun Ming-Ming Cheng Paul\u00a0L Rosin and Liang Wang. 2018. Visual sentiment prediction based on automatic discovery of affective regions. IEEE Transactions on Multimedia 20 9 (2018) 2513\u20132525.","DOI":"10.1109\/TMM.2018.2803520"},{"key":"e_1_3_3_3_39_2","first-page":"397","volume-title":"European Conference on Computer Vision","author":"Zhang Jing","year":"2024","unstructured":"Jing Zhang, Liang Zheng, Meng Wang, and Dan Guo. 2024. Training a small emotional vision language model for visual art comprehension. In European Conference on Computer Vision. Springer, 397\u2013413."},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296357"}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3771032","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:10:41Z","timestamp":1765008641000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3771032"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":40,"alternative-id":["10.1145\/3743093.3771032","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3771032","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}