{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,23]],"date-time":"2026-07-23T15:42:57Z","timestamp":1784821377518,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1145\/3589334.3648146","type":"proceedings-article","created":{"date-parts":[[2024,5,8]],"date-time":"2024-05-08T07:08:13Z","timestamp":1715152093000},"page":"4585-4594","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["CapAlign: Improving Cross Modal Alignment via Informative Captioning for Harmful Meme Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-2811-1865","authenticated-orcid":false,"given":"Junhui","family":"Ji","sequence":"first","affiliation":[{"name":"School of Computer Science, University of Sydney, Sydney, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2096-0270","authenticated-orcid":false,"given":"Xuanrui","family":"Lin","sequence":"additional","affiliation":[{"name":"School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0191-7171","authenticated-orcid":false,"given":"Usman","family":"Naseem","sequence":"additional","affiliation":[{"name":"School of Computing, Macquarie University, Sydney, 
Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,5,13]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Shaden Shaar, Hamed Firooz, and Preslav Nakov.","author":"Alam Firoj","year":"2021","unstructured":"Firoj Alam, Stefano Cresci, Tanmoy Chakraborty, Fabrizio Silvestri, Dimiter Dimitrov, Giovanni Da San Martino, Shaden Shaar, Hamed Firooz, and Preslav Nakov. 2021. A survey on multimodal disinformation detection. arXiv preprint arXiv:2103.12541 (2021)."},{"key":"e_1_3_2_2_2_1","volume-title":"Shaden Shaar, Hamed Firooz, and Preslav Nakov.","author":"Alam Firoj","year":"2022","unstructured":"Firoj Alam, Stefano Cresci, Tanmoy Chakraborty, Fabrizio Silvestri, Dimiter Dimitrov, Giovanni Da San Martino, Shaden Shaar, Hamed Firooz, and Preslav Nakov. 2022. A Survey on Multimodal Disinformation Detection. arxiv: 2103.12541 [cs.MM]"},{"key":"e_1_3_2_2_3_1","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob L. Menick, Sebastian Borgeaud, Andy Brock, Aida Nematzadeh, Sahand Sharifzadeh, Miko\u0142aj Bi\u0144kowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Kar\u00e9n Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. Advances in Neural Information Processing Systems, Vol. 35 (Dec. 2022), 23716--23736. 
https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/hash\/960a172bc7fbf0177ccccbb411a7d800-Abstract-Conference.html"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISDA.2009.230"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00193"},{"key":"e_1_3_2_2_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 1877--1901. https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1239"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612498"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.22"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Rui Cao Roy Ka-Wei Lee Wen-Haw Chong and Jing Jiang. 2023 b. Prompting for Multimodal Hateful Meme Classification. arxiv: 2302.04156 [cs.CL]","DOI":"10.18653\/v1\/2022.emnlp-main.22"},{"key":"e_1_3_2_2_11_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Christiano Paul F","year":"2017","unstructured":"Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei. 2017. 
Deep Reinforcement Learning from Human Preferences. In Advances in Neural Information Processing Systems, Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/hash\/d5e2c0adad503c91f91df240d0cd4e49-Abstract.html"},{"key":"e_1_3_2_2_12_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_2_13_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","unstructured":"Yushi Hu Hang Hua Zhengyuan Yang Weijia Shi Noah A. Smith and Jiebo Luo. 2023. PromptCap: Prompt-Guided Task-Aware Image Captioning. https:\/\/doi.org\/10.48550\/arXiv.2211.09699 arXiv:2211.09699 [cs].","DOI":"10.48550\/arXiv.2211.09699"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3587427"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3587427"},{"key":"e_1_3_2_2_19_1","volume-title":"Supervised multimodal bitransformers for classifying images and text. arXiv preprint arXiv:1909.02950","author":"Kiela Douwe","year":"2019","unstructured":"Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Ethan Perez, and Davide Testuggine. 2019. Supervised multimodal bitransformers for classifying images and text. 
arXiv preprint arXiv:1909.02950 (2019)."},{"key":"e_1_3_2_2_20_1","volume-title":"Supervised Multimodal Bitransformers for Classifying Images and Text. arxiv","author":"Kiela Douwe","year":"2020","unstructured":"Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Ethan Perez, and Davide Testuggine. 2020a. Supervised Multimodal Bitransformers for Classifying Images and Text. arxiv: 1909.02950 [cs.CL]"},{"key":"e_1_3_2_2_21_1","first-page":"2611","article-title":"The hateful memes challenge: Detecting hate speech in multimodal memes","volume":"33","author":"Kiela Douwe","year":"2020","unstructured":"Douwe Kiela, Hamed Firooz, Aravind Mohan, Vedanuj Goswami, Amanpreet Singh, Pratik Ringshia, and Davide Testuggine. 2020b. The hateful memes challenge: Detecting hate speech in multimodal memes. Advances in Neural Information Processing Systems, Vol. 33 (2020), 2611--2624.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_22_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Kiela Douwe","year":"2020","unstructured":"Douwe Kiela, Hamed Firooz, Aravind Mohan, Vedanuj Goswami, Amanpreet Singh, Pratik Ringshia, and Davide Testuggine. 2020c. The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 2611--2624. https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1b84c4cee2b8b3d823b30e2d604b1878-Abstract.html"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1412.6980"},{"key":"e_1_3_2_2_24_1","volume-title":"ICML 2022 Workshop on Knowledge Retrieval and Language Models.","author":"Kojima Takeshi","unstructured":"Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. [n.,d.]. Large Language Models are Zero-Shot Reasoners. 
In ICML 2022 Workshop on Knowledge Retrieval and Language Models."},{"key":"e_1_3_2_2_25_1","volume-title":"Hate-CLIPper: Multimodal Hateful Meme Classification based on Cross-modal Interaction of CLIP Features. arXiv preprint arXiv:2210.05916","author":"Kumar Gokul Karthik","year":"2022","unstructured":"Gokul Karthik Kumar and Karthik Nandakumar. 2022. Hate-CLIPper: Multimodal Hateful Meme Classification based on Cross-modal Interaction of CLIP Features. arXiv preprint arXiv:2210.05916 (2022)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. https:\/\/doi.org\/10.48550\/arXiv.2301.12597 arXiv:2301.12597 [cs].","DOI":"10.48550\/arXiv.2301.12597"},{"key":"e_1_3_2_2_27_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In Proceedings of the 39th International Conference on Machine Learning. PMLR, 12888--12900. https:\/\/proceedings.mlr.press\/v162\/li22n.html ISSN: 2640--3498."},{"key":"e_1_3_2_2_28_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_2_29_1","volume-title":"ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. arxiv","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. 
ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. arxiv: 1908.02265 [cs.CV]"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","unstructured":"Shie Mannor Dori Peleg and Reuven Rubinstein. 2005. The cross entropy method for classification. 561--568. https:\/\/doi.org\/10.1145\/1102351.1102422","DOI":"10.1145\/1102351.1102422"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570450"},{"key":"e_1_3_2_2_32_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. 2022a. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, Vol. 35 (2022), 27730--27744.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_33_1","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul F. Christiano, Jan Leike, and Ryan Lowe. 2022b. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, Vol. 35 (Dec. 2022), 27730--27744. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/hash\/b1efde53be364a73914f58805a001731-Abstract-Conference.html"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","unstructured":"Fabio Petroni Tim Rockt\u00e4schel Patrick Lewis Anton Bakhtin Yuxiang Wu Alexander H. Miller and Sebastian Riedel. 2019. 
Language Models as Knowledge Bases? https:\/\/doi.org\/10.48550\/ARXIV.1909.01066","DOI":"10.48550\/ARXIV.1909.01066"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","unstructured":"Shraman Pramanick Dimitar Dimitrov Rituparna Mukherjee Shivam Sharma Md. Shad Akhtar Preslav Nakov and Tanmoy Chakraborty. 2021a. Detecting Harmful Memes and Their Targets. (2021). https:\/\/doi.org\/10.48550\/ARXIV.2110.00413","DOI":"10.48550\/ARXIV.2110.00413"},{"key":"e_1_3_2_2_36_1","volume-title":"Preslav Nakov, and Tanmoy Chakraborty.","author":"Pramanick Shraman","year":"2021","unstructured":"Shraman Pramanick, Shivam Sharma, Dimitar Dimitrov, Md Shad Akhtar, Preslav Nakov, and Tanmoy Chakraborty. 2021b. MOMENTA: A multimodal framework for detecting harmful memes and their targets. arXiv preprint arXiv:2109.05184 (2021)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2109.05184"},{"key":"e_1_3_2_2_38_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_39_1","volume-title":"Hamed Firooz, Alon Halevy, Fabrizio Silvestri, Preslav Nakov, Tanmoy Chakraborty, et al.","author":"Sharma Shivam","year":"2022","unstructured":"Shivam Sharma, Firoj Alam, Md Akhtar, Dimitar Dimitrov, Giovanni Da San Martino, Hamed Firooz, Alon Halevy, Fabrizio Silvestri, Preslav Nakov, Tanmoy Chakraborty, et al. 2022. Detecting and Understanding Harmful Memes: A Survey. arXiv preprint arXiv:2205.04274 (2022)."},{"key":"e_1_3_2_2_40_1","volume-title":"Very deep convolutional networks for large-scale image recognition. 
arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_2_41_1","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Stiennon Nisan","year":"2020","unstructured":"Nisan Stiennon, Long Ouyang, Jeffrey Wu, Daniel Ziegler, Ryan Lowe, Chelsea Voss, Alec Radford, Dario Amodei, and Paul F Christiano. 2020. Learning to summarize with human feedback. In Advances in Neural Information Processing Systems, Vol. 33. Curran Associates, Inc., 3008--3021. https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1f89885d556929e98d3ef9b86448f951-Abstract.html"},{"key":"e_1_3_2_2_42_1","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Tsimpoukelli Maria","year":"2021","unstructured":"Maria Tsimpoukelli, Jacob L Menick, Serkan Cabi, S. M. Ali Eslami, Oriol Vinyals, and Felix Hill. 2021. Multimodal Few-Shot Learning with Frozen Language Models. In Advances in Neural Information Processing Systems, Vol. 34. Curran Associates, Inc., 200--212. https:\/\/proceedings.neurips.cc\/paper\/2021\/hash\/01b7575c38dac42f3cfb7d500438b875-Abstract.html"},{"key":"e_1_3_2_2_43_1","volume-title":"International Conference on Learning Representations.","author":"Wei Jason","unstructured":"Jason Wei, Maarten Bosma, Vincent Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. [n.,d.] a. Finetuned Language Models are Zero-Shot Learners. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2201.11903"},{"key":"e_1_3_2_2_45_1","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Fei Xia Ed H Chi Quoc V Le Denny Zhou et al. [n. d.] b. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. 
In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.29"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547851"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859654"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","unstructured":"Deyao Zhu Jun Chen Kilichbek Haydarov Xiaoqian Shen Wenxuan Zhang and Mohamed Elhoseiny. 2023 a. ChatGPT Asks BLIP-2 Answers: Automatic Questioning Towards Enriched Visual Descriptions. https:\/\/doi.org\/10.48550\/arXiv.2303.06594 arXiv:2303.06594 [cs].","DOI":"10.48550\/arXiv.2303.06594"},{"key":"e_1_3_2_2_51_1","unstructured":"Deyao Zhu Jun Chen Kilichbek Haydarov Xiaoqian Shen Wenxuan Zhang and Mohamed Elhoseiny. 2023 b. ChatGPT Asks BLIP-2 Answers: Automatic Questioning Towards Enriched Visual Descriptions. arxiv: 2303.06594 [cs.CV]"}],"event":{"name":"WWW '24: The ACM Web Conference 2024","location":"Singapore Singapore","acronym":"WWW '24","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 
2024"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589334.3648146","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3589334.3648146","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:28:06Z","timestamp":1755822486000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589334.3648146"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":51,"alternative-id":["10.1145\/3589334.3648146","10.1145\/3589334"],"URL":"https:\/\/doi.org\/10.1145\/3589334.3648146","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]},"assertion":[{"value":"2024-05-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}