{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T17:18:57Z","timestamp":1765041537133,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nd\/4.0\/"}],"funder":[{"name":"JSPS KAKENHI","award":["JP22K12091 and JP23H00497"],"award-info":[{"award-number":["JP22K12091 and JP23H00497"]}]},{"name":"JST FOREST Grant","award":["JPMJFR216O"],"award-info":[{"award-number":["JPMJFR216O"]}]},{"name":"JST CREST Grant","award":["JPMJCR20D3"],"award-info":[{"award-number":["JPMJCR20D3"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658102","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"515-523","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Retrieving Emotional Stimuli in Artworks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2544-7744","authenticated-orcid":false,"given":"Tianwei","family":"Chen","sequence":"first","affiliation":[{"name":"Osaka University, Suita, Osaka, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9200-6359","authenticated-orcid":false,"given":"Noa","family":"Garcia","sequence":"additional","affiliation":[{"name":"Osaka University, Suita, Osaka, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8879-5957","authenticated-orcid":false,"given":"Liangzhi","family":"Li","sequence":"additional","affiliation":[{"name":"Osaka University, Suita, Osaka, 
Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8000-3567","authenticated-orcid":false,"given":"Yuta","family":"Nakashima","sequence":"additional","affiliation":[{"name":"Osaka University, Suita, Osaka, Japan"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Guibas","author":"Achlioptas Panos","year":"2021","unstructured":"Panos Achlioptas, Maks Ovsjanikov, Kilichbek Haydarov, Mohamed Elhoseiny, and Leonidas J. Guibas. 2021. ArtEmis: Affective Language for Visual Art. CVPR, 11564--11574."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In CVPR. 6077--6086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Zechen Bai Yuta Nakashima and Noa Garcia. 2021. Explain Me the Painting: Multi-Topic Knowledgeable Art Description Generation. In ICCV. 5402--5412.","DOI":"10.1109\/ICCV48922.2021.00537"},{"key":"e_1_3_2_1_4_1","volume-title":"Efros","author":"Brooks Tim","year":"2023","unstructured":"Tim Brooks, Aleksander Holynski, and Alexei A. Efros. 2023. InstructPix2Pix: Learning to Follow Image Editing Instructions. In CVPR. 18392--18402."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.2190\/EM.27.1.f"},{"key":"e_1_3_2_1_7_1","volume-title":"Crowley and Andrew Zisserman","author":"Elliot","year":"2014","unstructured":"Elliot J. Crowley and Andrew Zisserman. 2014a. The State of the Art: Object Retrieval in Paintings using Discriminative Regions. In BMVC."},{"key":"e_1_3_2_1_8_1","volume-title":"Crowley and Andrew Zisserman","author":"Elliot","year":"2014","unstructured":"Elliot J. Crowley and Andrew Zisserman. 2014b. 
The State of the Art: Object Retrieval in Paintings using Discriminative Regions. In BMVC."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3169234"},{"key":"e_1_3_2_1_10_1","volume-title":"Emotional Attention: A Study of Image Sentiment and Visual Attention. In CVPR. 7521--7531.","author":"Fan Shaojing","year":"2018","unstructured":"Shaojing Fan, Zhiqi Shen, Ming Jiang, Bryan L. Koenig, Juan Xu, M. Kankanhalli, and Qi Zhao. 2018. Emotional Attention: A Study of Image Sentiment and Visual Attention. In CVPR. 7521--7531."},{"key":"e_1_3_2_1_11_1","volume-title":"ECCV Workshops","volume":"11130","author":"Garcia Noa","year":"2018","unstructured":"Noa Garcia and George Vogiatzis. 2018. How to Read Paintings: Semantic Art Understanding with Multi-modal Retrieval. In ECCV Workshops, Vol. 11130. 676--691."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-66096-3_8"},{"key":"e_1_3_2_1_13_1","volume-title":"ECCV Workshops.","author":"Gonthier Nicolas","year":"2018","unstructured":"Nicolas Gonthier, Yann Gousseau, Said Ladjal, and Olivier Bonfait. 2018a. Weakly supervised object detection in artworks. In ECCV Workshops."},{"key":"e_1_3_2_1_14_1","volume-title":"Weakly Supervised Object Detection in Artworks. In ECCV Workshops. 692--709","author":"Gonthier Nicolas","year":"2018","unstructured":"Nicolas Gonthier, Yann Gousseau, Sa\u00efd Ladjal, and Olivier Bonfait. 2018b. Weakly Supervised Object Detection in Artworks. In ECCV Workshops. 692--709."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Kaiming He X. Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. In CVPR. 770--778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"S. Kazemzadeh V. Ordonez M. Andr\u00e9 Matten and T. L. Berg. 2014. ReferItGame: Referring to Objects in Photographs of Natural Scenes. 
In EMNLP.","DOI":"10.3115\/v1\/D14-1086"},{"key":"e_1_3_2_1_17_1","unstructured":"Huijie Lin Jia Jia Quan Guo Yuanyuan Xue Jie Huang Lianhong Cai and Ling Feng. 2014. Psychological stress detection from cross-media microblog data using Deep Sparse Neural Network. In ICME. 1--6."},{"key":"e_1_3_2_1_18_1","unstructured":"Gaowen Liu Yan Yan Elisa Ricci Yi Yang Yahong Han Stefan Winkler and N. Sebe. 2015. Inferring Painting Style with Multi-Task Dictionary Learning. In IJCAI."},{"key":"e_1_3_2_1_19_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled Weight Decay Regularization. In ICLR."},{"key":"e_1_3_2_1_20_1","unstructured":"J. Lu D. Batra D. Parikh and S. Lee. 2019. ViLBERT : Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In NeurIPS."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"J. Lu V. Goswami M. Rohrbach D. Parikh and S. Lee. 2020. 12-in-1: Multi-Task Vision and Language Representation Learning. In CVPR. 10434--10443.","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Jana Machajdik and Allan Hanbury. 2010. Affective image classification using features inspired by psychology and art theory.","DOI":"10.1145\/1873951.1873965"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"J. Mao J. Huang A. Toshev O. Camburu A. L. Yuille and K. P. Murphy. 2016. Generation and Comprehension of Unambiguous Object Descriptions. In CVPR. 11--20.","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_24_1","volume-title":"van Gemert","author":"Mensink Thomas","year":"2014","unstructured":"Thomas Mensink and Jan C. van Gemert. 2014. The Rijksmuseum Challenge: Museum-Centered Visual Recognition. In ICMR. 451."},{"key":"e_1_3_2_1_25_1","volume-title":"Kilichbek Haydarov, and Mohamed Elhoseiny.","author":"Mohamed Youssef","year":"2022","unstructured":"Youssef Mohamed, Faizan Farooq Khan, Kilichbek Haydarov, and Mohamed Elhoseiny. 
2022. It is Okay to Not Be Okay: Overcoming Emotional Bias in Affective Image Captioning by Contrastive Data Collection. In CVPR. 21231--21240."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.newideapsych.2010.04.001"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Kuan-Chuan Peng Amir Sadovnik Andrew C. Gallagher and Tsuhan Chen. 2016. Where do emotions come from? Predicting the Emotion Stimuli Map. In ICIP. 614--618.","DOI":"10.1109\/ICIP.2016.7532430"},{"key":"e_1_3_2_1_28_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_29_1","volume-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents. ArXiv","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. ArXiv , Vol. abs\/2204.06125 (2022)."},{"key":"e_1_3_2_1_30_1","first-page":"1137","article-title":"Faster R-CNN","volume":"39","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross B. Girshick, and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. TPAMI , Vol. 39 (2015), 1137--1149.","journal-title":"Towards Real-Time Object Detection with Region Proposal Networks. TPAMI"},{"key":"e_1_3_2_1_31_1","volume-title":"DEArt: Dataset of European Art. 
In ECCV Workshop","volume":"13801","author":"Reshetnikov Artem","year":"2022","unstructured":"Artem Reshetnikov, Maria-Cristina V. Marinescu, and Joaquim Mor\u00e9 L\u00f3pez. 2022. DEArt: Dataset of European Art. In ECCV Workshop, Vol. 13801. 218--233."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. In CVPR. 10674--10685.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_33_1","volume-title":"Collomosse","author":"Ruta Dan","year":"2022","unstructured":"Dan Ruta, Andrew Gilbert, Pranav Aggarwal, Naveen Marri, Ajinkya Kale, John Briggs, Chris Speed, Hailin Jin, Baldo Faieta, Alex Filipkowski, Zhe Lin, and John P. Collomosse. 2022. StyleBabel: Artistic Style Tagging and Captioning. In ECCV."},{"key":"e_1_3_2_1_34_1","volume-title":"Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL.","author":"Sharma P.","year":"2018","unstructured":"P. Sharma, N. Ding, S. Goodman, and R. Soricut. 2018. Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"XI Shen Alexei A. Efros and Mathieu Aubry. 2019. Discovering Visual Patterns in Art Collections With Spatially-Consistent Feature Learning. 9270--9279 pages.","DOI":"10.1109\/CVPR.2019.00950"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1037\/1089-2680.9.4.342"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3273022"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Raphael Tang Linqing Liu Akshat Pandey Zhiying Jiang Gefei Yang Karun Kumar Pontus Stenetorp Jimmy Lin and Ferhan Ture. 2023. What the DAAM: Interpreting Stable Diffusion Using Cross Attention. In ACL. 
5644--5659.","DOI":"10.18653\/v1\/2023.acl-long.310"},{"key":"e_1_3_2_1_39_1","volume-title":"An Empirical Study Involving Art Classification. In ECCV Workshop","volume":"13801","author":"Tonkes Vincent","year":"2022","unstructured":"Vincent Tonkes and Matthia Sabatelli. 2022. How Well Do Vision Transformers (VTs) Transfer to the Non-natural Image Domain? An Empirical Study Involving Art Classification. In ECCV Workshop, Vol. 13801. 234--250."},{"key":"e_1_3_2_1_40_1","volume-title":"Lauw","author":"Truong Quoc-Tuan","year":"2017","unstructured":"Quoc-Tuan Truong and Hady W. Lauw. 2017. Visual Sentiment Analysis for Review Images with Item-Oriented and User-Oriented CNN. In ACM MM."},{"key":"e_1_3_2_1_41_1","volume-title":"Lauw","author":"Truong Quoc-Tuan","year":"2019","unstructured":"Quoc-Tuan Truong and Hady W. Lauw. 2019. VistaNet: Visual Aspect Attention Network for Multimodal Sentiment Analysis. In AAAI."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Xin Wang Huijun Zhang Lei Cao and Ling Feng. 2020. Leverage Social Media for Personalized Stress Detection. In ACM MM.","DOI":"10.1145\/3394171.3413596"},{"key":"e_1_3_2_1_43_1","first-page":"1211","volume-title":"Belongie","author":"Wilber Michael J.","year":"2017","unstructured":"Michael J. Wilber, Chen Fang, Hailin Jin, Aaron Hertzmann, John P. Collomosse, and Serge J. Belongie. 2017. BAM! The Behance Artistic Media Dataset for Recognition Beyond Photography. 1211--1220 pages."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Yankun Wu Yuta Nakashima and Noa Garcia. 2023. Not Only Generative Art: Stable Diffusion for Content-Style Disentanglement in Art Analysis. In ICMR. 
199--208.","DOI":"10.1145\/3591106.3592262"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.newideapsych.2011.09.003"},{"key":"e_1_3_2_1_46_1","volume-title":"Xindi Shang, Zehuan Yuan, Ying Sun, and Jun Liu.","author":"Xu Li","year":"2023","unstructured":"Li Xu, Mark He Huang, Xindi Shang, Zehuan Yuan, Ying Sun, and Jun Liu. 2023. Meta Compositional Referring Expression Segmentation. In CVPR. 19478--19487."},{"key":"e_1_3_2_1_47_1","volume-title":"Lui","author":"Xu Liwen","year":"2022","unstructured":"Liwen Xu, Z. Wang, Bingwen Wu, and Simon S. Y. Lui. 2022. MDAN: Multi-level Dependent Attention Network for Visual Emotion Analysis. In CVPR. 9469--9478."},{"key":"e_1_3_2_1_48_1","first-page":"8686","article-title":"SOLVER","volume":"30","author":"Yang Jingyuan","year":"2021","unstructured":"Jingyuan Yang, Xinbo Gao, Leida Li, Xiumei Wang, and Jinshan Ding. 2021a. SOLVER: Scene-Object Interrelated Visual Emotion Reasoning Network. TIP , Vol. 30 (2021), 8686--8701.","journal-title":"Scene-Object Interrelated Visual Emotion Reasoning Network. TIP"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Jingyuan Yang Qirui Huang Tingting Ding Dani Lischinski Daniel Cohen-Or and Hui Huang. 2023. EmoSet: A Large-scale Visual Emotion Dataset with Rich Attributes. In ICCV. 20326--20337.","DOI":"10.1109\/ICCV51070.2023.01864"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3106813"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Jufeng Yang Dongyu She Yu-Kun Lai Paul L. Rosin and Ming-Hsuan Yang. 2018. Weakly Supervised Coupled Networks for Visual Sentiment Analysis. CVPR 7584--7592.","DOI":"10.1109\/CVPR.2018.00791"},{"key":"e_1_3_2_1_52_1","volume-title":"Torr","author":"Yang Zhao","year":"2022","unstructured":"Zhao Yang, Jiaqi Wang, Yansong Tang, Kai Chen, Hengshuang Zhao, and Philip H. S. Torr. 2022. LAVT: Language-Aware Vision Transformer for Referring Image Segmentation. 
In CVPR. 18134--18144."},{"key":"e_1_3_2_1_53_1","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2).","author":"Ypsilantis Nikolaos-Antonios","year":"2021","unstructured":"Nikolaos-Antonios Ypsilantis, Noa Garcia, Guangxing Han, Sarah Ibrahimi, Nanne Van Noord, and Giorgos Tolias. 2021. The met dataset: Instance-level recognition for artworks. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Pengchuan Zhang Xiujun Li Xiaowei Hu Jianwei Yang Lei Zhang Lijuan Wang Yejin Choi and Jianfeng Gao. 2021. VinVL: Revisiting Visual Representations in Vision-Language Models. In CVPR. 5575--5584.","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_55_1","first-page":"6729","article-title":"Affective Image Content Analysis","volume":"44","author":"Zhao Sicheng","year":"2022","unstructured":"Sicheng Zhao, Xingxu Yao, Jufeng Yang, Guoli Jia, Guiguang Ding, Tat-Seng Chua, Bj\u00f6rn W. Schuller, and Kurt Keutzer. 2022. Affective Image Content Analysis: Two Decades Review and New Perspectives. TPAMI , Vol. 44 (2022), 6729--6751.","journal-title":"Two Decades Review and New Perspectives. 
TPAMI"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658102","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658102","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:50:56Z","timestamp":1755766256000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658102"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":55,"alternative-id":["10.1145\/3652583.3658102","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658102","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}