{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T18:52:51Z","timestamp":1776106371131,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714895","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T22:57:28Z","timestamp":1745362648000},"page":"4734-4744","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Ask, Acquire, Understand: A Multimodal Agent-based Framework for Social Abuse Detection in Memes"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-2096-0270","authenticated-orcid":false,"given":"Xuanrui","family":"Lin","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2152-0569","authenticated-orcid":false,"given":"Chao","family":"Jia","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2811-1865","authenticated-orcid":false,"given":"Junhui","family":"Ji","sequence":"additional","affiliation":[{"name":"Zhipu AI, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4580-9467","authenticated-orcid":false,"given":"Hui","family":"Han","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0191-7171","authenticated-orcid":false,"given":"Usman","family":"Naseem","sequence":"additional","affiliation":[{"name":"Macquarie University, Sydney, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Decoding Memes: A Comprehensive Analysis of Late and Early Fusion Models for Explainable Meme Analysis. In Companion Proceedings of the ACM on Web Conference","author":"Abdullakutty Faseela","year":"2024","unstructured":"Faseela Abdullakutty and Usman Naseem. 2024. Decoding Memes: A Comprehensive Analysis of Late and Early Fusion Models for Explainable Meme Analysis. In Companion Proceedings of the ACM on Web Conference 2024. 1681--1689."},{"key":"e_1_3_2_1_2_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Shaden Shaar, Hamed Firooz, and Preslav Nakov.","author":"Alam Firoj","year":"2021","unstructured":"Firoj Alam, Stefano Cresci, Tanmoy Chakraborty, Fabrizio Silvestri, Dimiter Dimitrov, Giovanni Da San Martino, Shaden Shaar, Hamed Firooz, and Preslav Nakov. 2021. A survey on multimodal disinformation detection. arXiv preprint arXiv:2103.12541 (2021)."},{"key":"e_1_3_2_1_4_1","volume-title":"Openflamingo: An open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390","author":"Awadalla Anas","year":"2023","unstructured":"Anas Awadalla, Irena Gao, Josh Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Shiori Sagawa, et al. 2023. Openflamingo: An open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_6_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1239"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612498"},{"key":"e_1_3_2_1_9_1","volume-title":"Wen-Haw Chong, and Jing Jiang.","author":"Cao Rui","year":"2023","unstructured":"Rui Cao, Roy Ka-Wei Lee, Wen-Haw Chong, and Jing Jiang. 2023b. Prompting for multimodal hateful meme classification. arXiv preprint arXiv:2302.04156 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Chateval: Towards better llm-based evaluators through multi-agent debate. arXiv preprint arXiv:2308.07201","author":"Chan Chi-Min","year":"2023","unstructured":"Chi-Min Chan, Weize Chen, Yusheng Su, Jianxuan Yu, Wei Xue, Shanghang Zhang, Jie Fu, and Zhiyuan Liu. 2023. Chateval: Towards better llm-based evaluators through multi-agent debate. arXiv preprint arXiv:2308.07201 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478","author":"Chen Jun","year":"2023","unstructured":"Jun Chen, Deyao Zhu, Xiaoqian Shen, Xiang Li, Zechun Liu, Pengchuan Zhang, Raghuraman Krishnamoorthi, Vikas Chandra, Yunyang Xiong, and Mohamed Elhoseiny. 2023. Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"Unveiling Misogyny Memes: A Multimodal Analysis of Modality Effects on Identification. In Companion Proceedings of the ACM on Web Conference 2024. 1864","author":"Chen Shijing","year":"2024","unstructured":"Shijing Chen, Usman Naseem, Imran Razzak, and Flora Salim. 2024. Unveiling Misogyny Memes: A Multimodal Analysis of Modality Effects on Identification. In Companion Proceedings of the ACM on Web Conference 2024. 1864--1871."},{"key":"e_1_3_2_1_13_1","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al. 2024. Scaling instruction-finetuned language models. Journal of Machine Learning Research, Vol. 25, 70 (2024), 1--53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_14_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2024. Instructblip: Towards general-purpose vision-language models with instruction tuning. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Improving factuality and reasoning in language models through multiagent debate. arXiv preprint arXiv:2305.14325","author":"Du Yilun","year":"2023","unstructured":"Yilun Du, Shuang Li, Antonio Torralba, Joshua B Tenenbaum, and Igor Mordatch. 2023. Improving factuality and reasoning in language models through multiagent debate. arXiv preprint arXiv:2305.14325 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/3463952.3464010"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.semeval-1.74"},{"key":"e_1_3_2_1_18_1","volume-title":"Nando De Freitas, and Shimon Whiteson.","author":"Foerster Jakob","year":"2016","unstructured":"Jakob Foerster, Ioannis Alexandros Assael, Nando De Freitas, and Shimon Whiteson. 2016. Learning to communicate with deep multi-agent reinforcement learning. Advances in neural information processing systems, Vol. 29 (2016)."},{"key":"e_1_3_2_1_19_1","volume-title":"Multimodal-gpt: A vision and language model for dialogue with humans. arXiv preprint arXiv:2305.04790","author":"Gong Tao","year":"2023","unstructured":"Tao Gong, Chengqi Lyu, Shilong Zhang, Yudong Wang, Miao Zheng, Qian Zhao, Kuikun Liu, Wenwei Zhang, Ping Luo, and Kai Chen. 2023. Multimodal-gpt: A vision and language model for dialogue with humans. arXiv preprint arXiv:2305.04790 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3648146"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3587427"},{"key":"e_1_3_2_1_22_1","volume-title":"Supervised multimodal bitransformers for classifying images and text. arXiv preprint arXiv:1909.02950","author":"Kiela Douwe","year":"2019","unstructured":"Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Ethan Perez, and Davide Testuggine. 2019. Supervised multimodal bitransformers for classifying images and text. arXiv preprint arXiv:1909.02950 (2019)."},{"key":"e_1_3_2_1_23_1","first-page":"2611","article-title":"The hateful memes challenge: Detecting hate speech in multimodal memes","volume":"33","author":"Kiela Douwe","year":"2020","unstructured":"Douwe Kiela, Hamed Firooz, Aravind Mohan, Vedanuj Goswami, Amanpreet Singh, Pratik Ringshia, and Davide Testuggine. 2020a. The hateful memes challenge: Detecting hate speech in multimodal memes. Advances in Neural Information Processing Systems, Vol. 33 (2020), 2611--2624.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Kiela Douwe","year":"2020","unstructured":"Douwe Kiela, Hamed Firooz, Aravind Mohan, Vedanuj Goswami, Amanpreet Singh, Pratik Ringshia, and Davide Testuggine. 2020b. The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 2611--2624. https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1b84c4cee2b8b3d823b30e2d604b1878-Abstract.html"},{"key":"e_1_3_2_1_25_1","volume-title":"Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa.","author":"Kojima Takeshi","year":"2022","unstructured":"Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. 2022. Large language models are zero-shot reasoners. Advances in neural information processing systems, Vol. 35 (2022), 22199--22213."},{"key":"e_1_3_2_1_26_1","volume-title":"Hate-CLIPper: Multimodal hateful meme classification based on cross-modal interaction of CLIP features. arXiv preprint arXiv:2210.05916","author":"Kumar Gokul Karthik","year":"2022","unstructured":"Gokul Karthik Kumar and Karthik Nandakumar. 2022. Hate-CLIPper: Multimodal hateful meme classification based on cross-modal interaction of CLIP features. arXiv preprint arXiv:2210.05916 (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"Hani Itani, Dmitrii Khizbullin, and Bernard Ghanem.","author":"Li Guohao","year":"2023","unstructured":"Guohao Li, Hasan Abed Al Kader Hammoud, Hani Itani, Dmitrii Khizbullin, and Bernard Ghanem. 2023a. Camel: Communicative agents for'' mind'' exploration of large scale language model society. (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_29_1","volume-title":"Encouraging divergent thinking in large language models through multi-agent debate. arXiv preprint arXiv:2305.19118","author":"Liang Tian","year":"2023","unstructured":"Tian Liang, Zhiwei He, Wenxiang Jiao, Xing Wang, Yan Wang, Rui Wang, Yujiu Yang, Zhaopeng Tu, and Shuming Shi. 2023. Encouraging divergent thinking in large language models through multi-agent debate. arXiv preprint arXiv:2305.19118 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"GOAT-Bench: Safety Insights to Large Multimodal Models through Meme-Based Social Abuse. arXiv preprint arXiv:2401.01523","author":"Lin Hongzhan","year":"2024","unstructured":"Hongzhan Lin, Ziyang Luo, Bo Wang, Ruichao Yang, and Jing Ma. 2024. GOAT-Bench: Safety Insights to Large Multimodal Models through Meme-Based Social Abuse. arXiv preprint arXiv:2401.01523 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023a. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_32_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li Bo Li Yuanhan Zhang Sheng Shen and Yong Jae Lee. 2024a. LLaVA-NeXT: Improved reasoning OCR and world knowledge. https:\/\/llava-vl.github.io\/blog\/2024-01--30-llava-next\/"},{"key":"e_1_3_2_1_33_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024b. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Training socially aligned language models in simulated human society. arXiv preprint arXiv:2305.16960","author":"Liu Ruibo","year":"2023","unstructured":"Ruibo Liu, Ruixin Yang, Chenyan Jia, Ge Zhang, Denny Zhou, Andrew M Dai, Diyi Yang, and Soroush Vosoughi. 2023b. Training socially aligned language models in simulated human society. arXiv preprint arXiv:2305.16960 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"OpenAI Pieter Abbeel, and Igor Mordatch","author":"Lowe Ryan","year":"2017","unstructured":"Ryan Lowe, Yi I Wu, Aviv Tamar, Jean Harb, OpenAI Pieter Abbeel, and Igor Mordatch. 2017. Multi-agent actor-critic for mixed cooperative-competitive environments. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_37_1","volume-title":"Roco: Dialectic multi-robot collaboration with large language models. arXiv preprint arXiv:2307.04738","author":"Mandi Zhao","year":"2023","unstructured":"Zhao Mandi, Shreeya Jain, and Shuran Song. 2023. Roco: Dialectic multi-robot collaboration with large language models. arXiv preprint arXiv:2307.04738 (2023)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570450"},{"key":"e_1_3_2_1_39_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems Vol. 35 (2022) 27730--27744."},{"key":"e_1_3_2_1_40_1","volume-title":"Preslav Nakov, and Tanmoy Chakraborty.","author":"Pramanick Shraman","year":"2021","unstructured":"Shraman Pramanick, Dimitar Dimitrov, Rituparna Mukherjee, Shivam Sharma, Md Shad Akhtar, Preslav Nakov, and Tanmoy Chakraborty. 2021a. Detecting harmful memes and their targets. arXiv preprint arXiv:2110.00413 (2021)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2110.00413"},{"key":"e_1_3_2_1_42_1","volume-title":"Preslav Nakov, and Tanmoy Chakraborty.","author":"Pramanick Shraman","year":"2021","unstructured":"Shraman Pramanick, Shivam Sharma, Dimitar Dimitrov, Md Shad Akhtar, Preslav Nakov, and Tanmoy Chakraborty. 2021c. MOMENTA: A multimodal framework for detecting harmful memes and their targets. arXiv preprint arXiv:2109.05184 (2021)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","unstructured":"Shraman Pramanick Shivam Sharma Dimitar Dimitrov Md Shad Akhtar Preslav Nakov and Tanmoy Chakraborty. 2021 d. MOMENTA: A Multimodal Framework for Detecting Harmful Memes and Their Targets. https:\/\/doi.org\/10.48550\/arXiv.2109.05184 arXiv:2109.05184 [cs].","DOI":"10.48550\/arXiv.2109.05184"},{"key":"e_1_3_2_1_44_1","volume-title":"Hamed Firooz, Alon Halevy, Fabrizio Silvestri, Preslav Nakov, Tanmoy Chakraborty, et al.","author":"Sharma Shivam","year":"2022","unstructured":"Shivam Sharma, Firoj Alam, Md Akhtar, Dimitar Dimitrov, Giovanni Da San Martino, Hamed Firooz, Alon Halevy, Fabrizio Silvestri, Preslav Nakov, Tanmoy Chakraborty, et al. 2022. Detecting and Understanding Harmful Memes: A Survey. arXiv preprint arXiv:2205.04274 (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying. European Language Resources Association (ELRA)","author":"Suryawanshi Shardul","year":"2020","unstructured":"Shardul Suryawanshi, Bharathi Raja Chakravarthi, Mihael Arcan, and Paul Buitelaar. 2020. Multimodal Meme Dataset (MultiOFF) for Identifying Offensive Content in Image and Text. In Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying. European Language Resources Association (ELRA), Marseille, France, 32--41. https:\/\/aclanthology.org\/2020.trac-1.6"},{"key":"e_1_3_2_1_46_1","volume-title":"Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079","author":"Wang Weihan","year":"2023","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, et al. 2023. Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le.","author":"Wei Jason","year":"2021","unstructured":"Jason Wei, Maarten Bosma, Vincent Y Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. 2021. Finetuned language models are zero-shot learners. arXiv preprint arXiv:2109.01652 (2021)."},{"key":"e_1_3_2_1_48_1","unstructured":"Jason Wei Yi Tay Rishi Bommasani Colin Raffel Barret Zoph Sebastian Borgeaud Dani Yogatama Maarten Bosma Denny Zhou Donald Metzler et al. 2022a. Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022b. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824--24837."},{"key":"e_1_3_2_1_50_1","volume-title":"Examining the inter-consistency of large language models: An in-depth analysis via debate","author":"Xiong Kai","unstructured":"Kai Xiong, Xiao Ding, Yixin Cao, Ting Liu, and Bing Qin. 2023. Examining the inter-consistency of large language models: An in-depth analysis via debate. Association for Computational Linguistics."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547851"},{"key":"e_1_3_2_1_52_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714895","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714895","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:53Z","timestamp":1750295933000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714895"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":53,"alternative-id":["10.1145\/3696410.3714895","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714895","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}