{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T02:59:49Z","timestamp":1774321189076,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681332","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"3352-3361","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["XMeCap: Meme Caption Generation with Sub-Image Adaptability"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4381-486X","authenticated-orcid":false,"given":"Yuyan","family":"Chen","sequence":"first","affiliation":[{"name":"Shanghai Key Laboratory of Data Science, School of Computer Science, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9384-500X","authenticated-orcid":false,"given":"Songzhou","family":"Yan","sequence":"additional","affiliation":[{"name":"Shanghai Key Laboratory of Data Science, School of Computer Science, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4530-5516","authenticated-orcid":false,"given":"Zhihong","family":"Zhu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2355-288X","authenticated-orcid":false,"given":"Zhixu","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Key Laboratory of Data Science, School of Computer 
Science, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8403-9591","authenticated-orcid":false,"given":"Yanghua","family":"Xiao","sequence":"additional","affiliation":[{"name":"Shanghai Key Laboratory of Data Science, School of Computer Science, Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"When to Laugh and How Hard? A Multimodal Approach to Detecting Humor and its Intensity. arXiv preprint arXiv:2211.01889","author":"Alnajjar Khalid","year":"2022","unstructured":"Khalid Alnajjar, Mika H\u00e4m\u00e4l\u00e4inen, J\u00f6rg Tiedemann, Jorma Laaksonen, and Mikko Kurimo. 2022. When to Laugh and How Hard? A Multimodal Approach to Detecting Humor and its Intensity. arXiv preprint arXiv:2211.01889 (2022)."},{"key":"e_1_3_2_1_2_1","volume-title":"Dank Learning: Generating Memes Using Deep Neural Networks. arxiv","author":"Meltem Tolunay Abel L","year":"2018","unstructured":"Abel L Peirson V au2 and E Meltem Tolunay. 2018. Dank Learning: Generating Memes Using Deep Neural Networks. arxiv: 1806.04510 [cs.CL]"},{"key":"e_1_3_2_1_3_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arxiv: 2308.12966 [cs.CV]"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 
65--72."},{"key":"e_1_3_2_1_5_1","volume-title":"Caption enriched samples for improving hateful memes detection. arXiv preprint arXiv:2109.10649","author":"Blaier Efrat","year":"2021","unstructured":"Efrat Blaier, Itzik Malkiel, and Lior Wolf. 2021. Caption enriched samples for improving hateful memes detection. arXiv preprint arXiv:2109.10649 (2021)."},{"key":"e_1_3_2_1_6_1","volume-title":"Towards Multimodal Sarcasm Detection (An Obviously Perfect Paper). arxiv","author":"Castro Santiago","year":"2019","unstructured":"Santiago Castro, Devamanyu Hazarika, Ver\u00f3nica P\u00e9rez-Rosas, Roger Zimmermann, Rada Mihalcea, and Soujanya Poria. 2019. Towards Multimodal Sarcasm Detection (An Obviously Perfect Paper). arxiv: 1906.01815 [cs.CL]"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics. 6752--6761","author":"Chauhan Dushyant Singh","year":"2022","unstructured":"Dushyant Singh Chauhan, Gopendra Vikram Singh, Aseem Arora, Asif Ekbal, and Pushpak Bhattacharyya. 2022. A Sentiment and Emotion aware Multimodal Multiparty Humor Recognition in Multilingual Conversational Setting. In Proceedings of the 29th International Conference on Computational Linguistics. 6752--6761."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479959"},{"key":"e_1_3_2_1_9_1","unstructured":"Keqin Chen Zhao Zhang Weili Zeng Richong Zhang Feng Zhu and Rui Zhao. 2023 f. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. 
arxiv: 2306.15195 [cs.CV]"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.7592\/EJHR2018.6.1.chen"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3614904"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3614905"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.131"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570431"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.215"},{"key":"e_1_3_2_1_16_1","volume-title":"2023 e. XMQAs: Constructing Complex-Modified Question-Answering Dataset for Robust Question Understanding","author":"Chen Yuyan","year":"2023","unstructured":"Yuyan Chen, Yanghua Xiao, Zhixu Li, and Bang Liu. 2023 e. XMQAs: Constructing Complex-Modified Question-Answering Dataset for Robust Question Understanding. IEEE Transactions on Knowledge and Data Engineering (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Grow-and-Clip: Informative-yet-Concise Evidence Distillation for Answer Explanation. In 2022 IEEE 38th International Conference on Data Engineering (ICDE). IEEE, 741--754","author":"Chen Yuyan","year":"2022","unstructured":"Yuyan Chen, Yanghua Xiao, and Bang Liu. 2022. Grow-and-Clip: Informative-yet-Concise Evidence Distillation for Answer Explanation. In 2022 IEEE 38th International Conference on Data Engineering (ICDE). 
IEEE, 741--754."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.130"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.173"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.128"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29736"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616855.3635860"},{"key":"e_1_3_2_1_23_1","volume-title":"Le","author":"Cubuk Ekin D.","year":"2019","unstructured":"Ekin D. Cubuk, Barret Zoph, Dandelion Mane, Vijay Vasudevan, and Quoc V. Le. 2019. AutoAugment: Learning Augmentation Policies from Data. arxiv: 1805.09501 [cs.CV]"},{"key":"e_1_3_2_1_24_1","volume-title":"InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv","author":"Dai W","year":"2023","unstructured":"W Dai, J Li, D Li, AMH Tiong, J Zhao, W Wang, B Li, P Fung, and S Hoi. [n.,d.]. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv 2023. arXiv preprint arXiv:2305.06500 ( [n.,d.])."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACII.2015.7344594"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17534"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.3115\/1642011.1642015"},{"key":"e_1_3_2_1_29_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. 
arxiv: 2301.12597 [cs.CV]"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.20944\/preprints202407.0981.v1"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"e_1_3_2_1_32_1","volume-title":"Memeplate: A Chinese Multimodal Dataset for Humor Understanding in Meme Templates. In CCF International Conference on Natural Language Processing and Chinese Computing. Springer, 527--538","author":"Li Zefeng","year":"2022","unstructured":"Zefeng Li, Hongfei Lin, Liang Yang, Bo Xu, and Shaowu Zhang. 2022. Memeplate: A Chinese Multimodal Dataset for Humor Understanding in Meme Templates. In CCF International Conference on Natural Language Processing and Chinese Computing. Springer, 527--538."},{"key":"e_1_3_2_1_33_1","volume-title":"Performance Analysis on Deep Learning Models in Humor Detection Task. In 2022 International Conference on Machine Learning and Knowledge Engineering (MLKE). IEEE, 93--97","author":"Li Zhuohang","year":"2022","unstructured":"Zhuohang Li, Jiashuo Liu, and Yuci Wang. 2022. Performance Analysis on Deep Learning Models in Humor Detection Task. In 2022 International Conference on Machine Learning and Knowledge Engineering (MLKE). IEEE, 93--97."},{"key":"e_1_3_2_1_34_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. Association for Computational Linguistics, Barcelona, Spain, 74--81. https:\/\/www.aclweb.org\/anthology\/W04--1013"},{"key":"e_1_3_2_1_35_1","first-page":"40","article-title":"Neural Radiance Fields Convert 2D to 3D Texture","volume":"3","author":"Lin Zheng","year":"2024","unstructured":"Zheng Lin, Chenghao Wang, Zichao Li, Zhuoyue Wang, Xinqi Liu, and Yue Zhu. 2024. Neural Radiance Fields Convert 2D to 3D Texture. 
Applied Science and Biotechnology Journal for Advanced Research, Vol. 3, 3 (2024), 40--44.","journal-title":"Applied Science and Biotechnology Journal for Advanced Research"},{"key":"e_1_3_2_1_36_1","first-page":"27","article-title":"Text Sentiment Detection and Classification Based on Integrated Learning Algorithm","volume":"3","author":"Lin Zheng","year":"2024","unstructured":"Zheng Lin, Zeyu Wang, Yue Zhu, Zichao Li, and Hao Qin. 2024. Text Sentiment Detection and Classification Based on Integrated Learning Algorithm. Applied Science and Engineering Journal for Advanced Research, Vol. 3, 3 (2024), 27--33.","journal-title":"Applied Science and Engineering Journal for Advanced Research"},{"key":"e_1_3_2_1_37_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong Jae Lee. 2023. Improved Baselines with Visual Instruction Tuning. arxiv: 2310.03744 [cs.CV]"},{"key":"e_1_3_2_1_38_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning. arxiv: 2304.08485 [cs.CV]"},{"key":"e_1_3_2_1_39_1","volume-title":"Enhancing document-level event argument extraction with contextual clues and role relevance. arXiv preprint arXiv:2310.05991","author":"Liu Wanlong","year":"2023","unstructured":"Wanlong Liu, Shaohuan Cheng, Dingyi Zeng, and Hong Qu. 2023. Enhancing document-level event argument extraction with contextual clues and role relevance. arXiv preprint arXiv:2310.05991 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Beyond Single-Event Extraction: Towards Efficient Document-Level Multi-Event Argument Extraction. arXiv preprint arXiv:2405.01884","author":"Liu Wanlong","year":"2024","unstructured":"Wanlong Liu, Li Zhou, Dingyi Zeng, Yichen Xiao, Shaohuan Cheng, Chen Zhang, Grandee Lee, Malu Zhang, and Wenyu Chen. 2024. Beyond Single-Event Extraction: Towards Efficient Document-Level Multi-Event Argument Extraction. 
arXiv preprint arXiv:2405.01884 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"Multi-modal application: Image Memes Generation. arXiv preprint arXiv:2112.01651","author":"Liu Zhiyuan","year":"2021","unstructured":"Zhiyuan Liu, Chuanzheng Sun, Yuxin Jiang, Shiqi Jiang, and Mei Ming. 2021. Multi-modal application: Image Memes Generation. arXiv preprint arXiv:2112.01651 (2021)."},{"key":"e_1_3_2_1_42_1","unstructured":"Jiasen Lu Christopher Clark Rowan Zellers Roozbeh Mottaghi and Aniruddha Kembhavi. 2022. Unified-IO: A Unified Model for Vision Language and Multi-Modal Tasks. arxiv: 2206.08916 [cs.CV]"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.348"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.716"},{"key":"e_1_3_2_1_45_1","volume-title":"Time Series Modeling for Heart Rate Prediction: From ARIMA to Transformers. arXiv preprint arXiv:2406.12199","author":"Ni Haowei","year":"2024","unstructured":"Haowei Ni, Shuchen Meng, Xieming Geng, Panfeng Li, Zhuoying Li, Xupeng Chen, Xiaotong Wang, and Shiyao Zhang. 2024. Time Series Modeling for Heart Rate Prediction: From ARIMA to Transformers. arXiv preprint arXiv:2406.12199 (2024). http:\/\/arxiv.org\/abs\/2406.12199"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00062"},{"key":"e_1_3_2_1_48_1","unstructured":"Hannes Ritschel Ilhan Aslan David Sedlbauer and Elisabeth Andr\u00e9. 2019. Irony man: Augmenting a social robot with the ability to use irony in multimodal communication with humans. 
(2019)."},{"key":"e_1_3_2_1_49_1","volume-title":"The PAAMS Collection: 18th International Conference, PAAMS 2020, L'Aquila, Italy, October 7--9, 2020, Proceedings 18","author":"Ritschel Hannes","year":"2020","unstructured":"Hannes Ritschel, Thomas Kiderle, Klaus Weber, Florian Lingenfelser, Tobias Baur, and Elisabeth Andr\u00e9. 2020. Multimodal joke generation and paralinguistic personalization for a socially-aware robot. In Advances in Practical Applications of Agents, Multi-Agent Systems, and Trustworthiness. The PAAMS Collection: 18th International Conference, PAAMS 2020, L'Aquila, Italy, October 7--9, 2020, Proceedings 18. Springer, 278--290."},{"key":"e_1_3_2_1_50_1","volume-title":"MemeBot: Towards automatic image meme generation. arXiv preprint arXiv:2004.14571","author":"Sadasivam Aadhavan","year":"2020","unstructured":"Aadhavan Sadasivam, Kausic Gunasekar, Hasan Davulcu, and Yezhou Yang. 2020. MemeBot: Towards automatic image meme generation. arXiv preprint arXiv:2004.14571 (2020)."},{"key":"e_1_3_2_1_51_1","volume-title":"Coling 2008: Companion volume: Posters. Coling 2008 Organizing Committee","author":"Sj\u00f6bergh Jonas","year":"2008","unstructured":"Jonas Sj\u00f6bergh and Kenji Araki. 2008. A Complete and Modestly Funny System for Generating and Performing Japanese Stand-Up Comedy. In Coling 2008: Companion volume: Posters. Coling 2008 Organizing Committee, Manchester, UK, 111--114. 
https:\/\/www.aclweb.org\/anthology\/C08--2028"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.pragma.2008.12.003"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3371158.3371403"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3127324"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00432"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-88480-2_49"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIPR.2019.00109"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1153"},{"key":"e_1_3_2_1_59_1","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arxiv: 2304.10592 [cs.CV]"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-short.15"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on 
Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681332","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681332","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:43Z","timestamp":1750295863000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681332"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":60,"alternative-id":["10.1145\/3664647.3681332","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681332","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}