{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,3]],"date-time":"2025-07-03T12:10:05Z","timestamp":1751544605979,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","funder":[{"name":"University of Science, VNU-HCM","award":["T2025-03"],"award-info":[{"award-number":["T2025-03"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,25]]},"DOI":"10.1145\/3709020.3734830","type":"proceedings-article","created":{"date-parts":[[2025,7,3]],"date-time":"2025-07-03T11:28:01Z","timestamp":1751542081000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LLaVA-SNIPPER: Scene-Graph-based Inference with Multimodal LLMs for Explainable Out-of-Context Misinformation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7729-2927","authenticated-orcid":false,"given":"Trong-Thuan","family":"Nguyen","sequence":"first","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam and Vietnam National University, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3046-3041","authenticated-orcid":false,"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[{"name":"University of Science, VNU-HCM, Ho Chi Minh, Vietnam and Vietnam National University, Ho Chi Minh, Vietnam"}]}],"member":"320","published-online":{"date-parts":[[2025,8,24]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01452"},{"key":"e_1_3_3_2_3_2","unstructured":"Ebtesam Almazrouei Hamza Alobeidli Abdulaziz Alshamsi Alessandro Cappelli Ruxandra Cojocaru Merouane Debbah Etienne Goffinet Daniel Heslow Julien Launay Quentin Malartic Badreddine Noune Baptiste Pannier and Guilherme Penedo. 2023. Falcon-40B: an open large language model with state-of-the-art performance. (2023)."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","first-page":"557","DOI":"10.1007\/978-981-16-0733-2_39","volume-title":"Proceedings of second international conference on computing, communications, and cyber-security: IC4S 2020","author":"Chadha Anupama","year":"2021","unstructured":"Anupama Chadha, Vaibhav Kumar, Sonu Kashyap, and Mayank Gupta. 2021. Deepfake: an overview. In Proceedings of second international conference on computing, communications, and cyber-security: IC4S 2020. Springer, 557\u2013566."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","unstructured":"Wenliang Dai Junnan Li Dongxu Li Anthony Meng\u00a0Huat Tiong Junqi Zhao Weisheng Wang Boyang Li Pascale Fung and Steven C.\u00a0H. Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. CoRR abs\/2305.06500 (2023). 10.48550\/ARXIV.2305.06500 arXiv:https:\/\/arXiv.org\/abs\/2305.06500","DOI":"10.48550\/ARXIV.2305.06500"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_3_2_7_2","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_3_2_8_2","unstructured":"Yimeng Gu Mengqi Zhang Ignacio Castro Shu Wu and Gareth Tyson. 2024. Learning Domain-Invariant Features for Out-of-Context News Detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.07430 (2024)."},{"key":"e_1_3_3_2_9_2","volume-title":"International Conference on Learning Representations","author":"Hu Edward\u00a0J","year":"2022","unstructured":"Edward\u00a0J Hu, yelong shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=nZeVKeeFYf9"},{"key":"e_1_3_3_2_10_2","first-page":"24229","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Im Jinbae","year":"2024","unstructured":"Jinbae Im, JeongYeon Nam, Nokyung Park, Hyungmin Lee, and Seunghyun Park. 2024. Egtr: Extracting graph from transformer for scene graph generation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24229\u201324238."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"e_1_3_3_2_12_2","first-page":"1197","volume-title":"Proceedings of the Asian Conference on Computer Vision","author":"Kalla Jayateja","year":"2024","unstructured":"Jayateja Kalla, Soma Biswas, et\u00a0al. 2024. CoVLM: Leveraging Consensus from Vision-Language Models for Semi-supervised Multimodal Fake News Detection. In Proceedings of the Asian Conference on Computer Vision. 1197\u20131214."},{"key":"e_1_3_3_2_13_2","unstructured":"Kumud Lakara Georgia Channing Juil Sock Christian Rupprecht Philip Torr John Collomosse and Christian\u00a0Schroeder de Witt. 2025. LLM-Consensus: Multi-Agent Debate for Visual Misinformation Detection. arxiv:https:\/\/arXiv.org\/abs\/2410.20140\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2410.20140"},{"key":"e_1_3_3_2_14_2","unstructured":"Liunian\u00a0Harold Li Mark Yatskar Da Yin Cho-Jui Hsieh and Kai-Wei Chang. 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. CoRR abs\/1908.03557 (2019). arXiv:https:\/\/arXiv.org\/abs\/1908.03557http:\/\/arxiv.org\/abs\/1908.03557"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2021.EMNLP-MAIN.542"},{"key":"e_1_3_3_2_16_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_3_2_17_2","first-page":"38","volume-title":"European Conference on Computer Vision","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et\u00a0al. 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European Conference on Computer Vision. Springer, 38\u201355."},{"key":"e_1_3_3_2_18_2","first-page":"11645","volume-title":"Proceedings of the AAAI conference on artificial intelligence","volume":"34","author":"Liu Yongfei","year":"2020","unstructured":"Yongfei Liu, Bo Wan, Xiaodan Zhu, and Xuming He. 2020. Learning cross-modal context graph for visual grounding. In Proceedings of the AAAI conference on artificial intelligence, Vol.\u00a034. 11645\u201311652."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2021.EMNLP-MAIN.545"},{"key":"e_1_3_3_2_20_2","first-page":"2819","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"Mu Michael","year":"2023","unstructured":"Michael Mu, Sreyasee Das\u00a0Bhattacharjee, and Junsong Yuan. 2023. Self-supervised distilled learning for multi-modal misinformation identification. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 2819\u20132828."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00144"},{"key":"e_1_3_3_2_22_2","unstructured":"Trong-Thuan Nguyen Pha Nguyen Jackson Cothren Alper Yilmaz and Khoa Luu. 2025. HyperGLM: HyperGraph for Video Scene Graph Generation and Anticipation. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2025)."},{"key":"e_1_3_3_2_23_2","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Nguyen Trong-Thuan","year":"2024","unstructured":"Trong-Thuan Nguyen, Pha Nguyen, and Khoa Luu. 2024. HIG: Hierarchical Interlacement Graph Approach to Scene Graph Generation in Video Understanding. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_3_2_24_2","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems","author":"Nguyen Trong-Thuan","year":"2024","unstructured":"Trong-Thuan Nguyen, Pha Nguyen, Li Xin, Cothren Jackson, Yilmaz Alper, and Khoa Luu. 2024. CYCLO: Cyclic Graph Transformer Approach to Multi-Object Relationship Modeling in Aerial Videos. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_2_25_2","unstructured":"OpenAI. 2023. GPT-4V(ision) System Card. https:\/\/cdn.openai.com\/papers\/GPTV_System_Card.pdf."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3592572.3592842"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3592572.3592842"},{"key":"e_1_3_3_2_28_2","first-page":"13052","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"Qi Peng","year":"2024","unstructured":"Peng Qi, Zehong Yan, Wynne Hsu, and Mong\u00a0Li Lee. 2024. Sniffer: Multimodal large language model for explainable out-of-context misinformation detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 13052\u201313062."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Tianwen Qian Jingjing Chen Shaoxiang Chen Bo Wu and Yu-Gang Jiang. 2022. Scene graph refinement network for visual question answering. IEEE Transactions on Multimedia 25 (2022) 3950\u20133961.","DOI":"10.1109\/TMM.2022.3169065"},{"key":"e_1_3_3_2_30_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_2_31_2","series-title":"Proceedings of Machine Learning Research","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748\u20138763. http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Md\u00a0Shohel Rana Mohammad\u00a0Nur Nobi Beddhu Murali and Andrew\u00a0H Sung. 2022. Deepfake detection: A systematic literature review. IEEE access 10 (2022) 25494\u201325513.","DOI":"10.1109\/ACCESS.2022.3154404"},{"key":"e_1_3_3_2_33_2","first-page":"256","volume-title":"European Conference on Computer Vision","author":"Sima Chonghao","year":"2024","unstructured":"Chonghao Sima, Katrin Renz, Kashyap Chitta, Li Chen, Hanxue Zhang, Chengen Xie, Jens Bei\u00dfwenger, Ping Luo, Andreas Geiger, and Hongyang Li. 2024. Drivelm: Driving with graph visual question answering. In European Conference on Computer Vision. Springer, 256\u2013274."},{"key":"e_1_3_3_2_34_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093614"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219903"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Mika Westerlund. 2019. The emergence of deepfake technology: A review. Technology innovation management review 9 11 (2019).","DOI":"10.22215\/timreview\/1282"},{"key":"e_1_3_3_2_38_2","unstructured":"Shengqiong Wu Hao Fei Hanwang Zhang and Tat-Seng Chua. 2024. Imagine that! abstract-to-intricate text-to-image synthesis with scene graph hallucination diffusion. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00997"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548409"},{"key":"e_1_3_3_2_41_2","unstructured":"WU Yin Zhengxuan Zhang WANG Fuling Yuyu Luo Hui Xiong and Nan Tang. [n. d.]. Detecting Out-of-Context Misinformation via Multi-Agent and Multi-Grained Retrieval. ([n. d.])."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","unstructured":"Yizhou Zhang Loc Trinh Defu Cao Zijun Cui and Yan Liu. 2023. Detecting Out-of-Context Multimodal Misinformation with interpretable neural-symbolic model. CoRR abs\/2304.07633 (2023). 10.48550\/ARXIV.2304.07633 arXiv:https:\/\/arXiv.org\/abs\/2304.07633","DOI":"10.48550\/ARXIV.2304.07633"},{"key":"e_1_3_3_2_43_2","first-page":"2839","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Zhao Chengyang","year":"2023","unstructured":"Chengyang Zhao, Yikang Shen, Zhenfang Chen, Mingyu Ding, and Chuang Gan. 2023. Textpsg: Panoptic scene graph generation from textual descriptions. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2839\u20132850."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Haiteng Zhao Shengchao Liu Ma Chang Hannan Xu Jie Fu Zhihong Deng Lingpeng Kong and Qi Liu. 2024. Gimlet: A unified graph-text model for instruction-based molecule zero-shot learning. Advances in Neural Information Processing Systems 36 (2024).","DOI":"10.1101\/2023.05.30.542904"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-47436-2_27"}],"event":{"name":"SCID '25: The 2nd Workshop on Security-Centric Strategies for Combating Information Disorder","sponsor":["SIGSAC ACM Special Interest Group on Security, Audit, and Control"],"location":"Hanoi Vietnam","acronym":"SCID '25"},"container-title":["Proceedings of the 2nd Workshop on Security-Centric Strategies for Combating Information Disorder"],"original-title":[],"deposited":{"date-parts":[[2025,7,3]],"date-time":"2025-07-03T11:28:23Z","timestamp":1751542103000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3709020.3734830"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,24]]},"references-count":44,"alternative-id":["10.1145\/3709020.3734830","10.1145\/3709020"],"URL":"https:\/\/doi.org\/10.1145\/3709020.3734830","relation":{},"subject":[],"published":{"date-parts":[[2025,8,24]]},"assertion":[{"value":"2025-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}