{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:04:17Z","timestamp":1775228657027,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Strategic Priority Research Program of Chinese Academy of Sciences","award":["No. XDC02040400"],"award-info":[{"award-number":["No. XDC02040400"]}]},{"name":"National Key Research and Development of China","award":["No. 2021YFB3100600"],"award-info":[{"award-number":["No. 2021YFB3100600"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613850","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"5696-5704","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":75,"title":["Cross-modal Contrastive Learning for Multimodal Fake News Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6200-5523","authenticated-orcid":false,"given":"Longzheng","family":"Wang","sequence":"first","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1866-6615","authenticated-orcid":false,"given":"Chuang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0258-7840","authenticated-orcid":false,"given":"Hongbo","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2963-0914","authenticated-orcid":false,"given":"Yongxiu","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4514-2056","authenticated-orcid":false,"given":"Xiaohan","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7199-0310","authenticated-orcid":false,"given":"Siqi","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; School of Cyber Security, University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"32897","article-title":"Vlmo: Unified vision-language pre-training with mixture-of-modality-experts","volume":"35","author":"Bao Hangbo","year":"2022","unstructured":"Hangbo Bao, Wenhui Wang, Li Dong, Qiang Liu, Owais Khan Mohammed, Kriti Aggarwal, Subhojit Som, Songhao Piao, and Furu Wei. 2022. Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. Advances in Neural Information Processing Systems 35 (2022), 32897--32912.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_2_1","volume-title":"Explainable tsetlin machine framework for fake news detection with credibility score assessment. arXiv preprint arXiv:2105.09114","author":"Bhattarai Bimal","year":"2021","unstructured":"Bimal Bhattarai, Ole-Christoffer Granmo, and Lei Jiao. 2021. Explainable tsetlin machine framework for fake news detection with credibility score assessment. arXiv preprint arXiv:2105.09114 (2021)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13735-017-0143-x"},{"key":"e_1_3_2_1_4_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3511968"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1002\/pra2.2015.145052010082"},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","volume-title":"Simcse: Simple contrastive learning of sentence embeddings. arXiv preprint arXiv:2104.08821","author":"Gao Tianyu","year":"2021","unstructured":"Tianyu Gao, Xingcheng Yao, and Danqi Chen. 2021. Simcse: Simple contrastive learning of sentence embeddings. arXiv preprint arXiv:2104.08821 (2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"PyramidCLIP: Hierarchical Feature Alignment for Vision-language Model Pre-training. arXiv preprint arXiv:2204.14095","author":"Gao Yuting","year":"2022","unstructured":"Yuting Gao, Jinfeng Liu, Zihan Xu, Jun Zhang, Ke Li, and Chunhua Shen. 2022. PyramidCLIP: Hierarchical Feature Alignment for Vision-language Model Pre-training. arXiv preprint arXiv:2204.14095 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3269206.3271709"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_1_15_1","volume-title":"et al","author":"Huo Yuqi","year":"2021","unstructured":"Yuqi Huo, Manli Zhang, Guangzhen Liu, Haoyu Lu, Yizhao Gao, Guoxing Yang, Jingyuan Wen, Heng Zhang, Baogui Xu, Weihao Zheng, et al . 2021. WenLan: Bridging vision and language by large-scale multi-modal pre-training. arXiv preprint arXiv:2103.06561 (2021)."},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123454"},{"key":"e_1_3_2_1_18_1","volume-title":"Novel visual and statistical image features for microblogs news verification","author":"Jin Zhiwei","year":"2016","unstructured":"Zhiwei Jin, Juan Cao, Yongdong Zhang, Jianshe Zhou, and Qi Tian. 2016. Novel visual and statistical image features for microblogs news verification. IEEE transactions on multimedia 19, 3 (2016), 598--608."},{"key":"e_1_3_2_1_19_1","volume-title":"Manish Gupta, and Vasudeva Varma.","author":"Khattar Dhruv","year":"2019","unstructured":"Dhruv Khattar, Jaipal Singh Goud, Manish Gupta, and Vasudeva Varma. 2019. Mvae: Multimodal variational autoencoder for fake news detection. In The world wide web conference. 2915--2921."},{"key":"e_1_3_2_1_20_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_22_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2806416.2806651"},{"key":"e_1_3_2_1_24_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_25_1","volume-title":"A stylometric inquiry into hyperpartisan and fake news. arXiv preprint arXiv:1702.05638","author":"Potthast Martin","year":"2017","unstructured":"Martin Potthast, Johannes Kiesel, Kevin Reinartz, Janek Bevendorff, and Benno Stein. 2017. A stylometric inquiry into hyperpartisan and fake news. arXiv preprint arXiv:1702.05638 (2017)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2019.00062"},{"key":"e_1_3_2_1_27_1","first-page":"3834","article-title":"Neural User Response Generator: Fake News Detection with Collective User Intelligence","volume":"18","author":"Qian Feng","year":"2018","unstructured":"Feng Qian, Chengyue Gong, Karishma Sharma, and Yan Liu. 2018. Neural User Response Generator: Fake News Detection with Collective User Intelligence.. In IJCAI, Vol. 18. 3834--3840.","journal-title":"IJCAI"},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_29_1","volume-title":"Leveraging Intra and Inter Modality Relationship for Multimodal Fake News Detection. In Companion Proceedings of the Web Conference","author":"Singhal Shivangi","year":"2022","unstructured":"Shivangi Singhal, Tanisha Pandey, Saksham Mrig, Rajiv Ratn Shah, and Ponnurangam Kumaraguru. 2022. Leveraging Intra and Inter Modality Relationship for Multimodal Fake News Detection. In Companion Proceedings of the Web Conference 2022. 726--734."},{"key":"e_1_3_2_1_30_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219903"},{"key":"e_1_3_2_1_32_1","volume-title":"Cross-Modal Knowledge Distillation in Multi-Modal Fake News Detection. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4733--4737","author":"Wei Zimian","year":"2022","unstructured":"Zimian Wei, Hengyue Pan, Linbo Qiao, Xin Niu, Peijie Dong, and Dongsheng Li. 2022. Cross-Modal Knowledge Distillation in Multi-Modal Fake News Detection. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4733--4737."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.226"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2021.102610"},{"key":"e_1_3_2_1_35_1","volume-title":"Consert: A contrastive framework for self-supervised sentence representation transfer. arXiv preprint arXiv:2105.11741","author":"Yan Yuanmeng","year":"2021","unstructured":"Yuanmeng Yan, Rumei Li, Sirui Wang, Fuzheng Zhang, Wei Wu, and Weiran Xu. 2021. Consert: A contrastive framework for self-supervised sentence representation transfer. arXiv preprint arXiv:2105.11741 (2021)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Feng Yu Qiang Liu Shu Wu Liang Wang Tieniu Tan et al. 2017. A Convolutional Approach for Misinformation Identification.. In IJCAI. 3901--3907.","DOI":"10.24963\/ijcai.2017\/545"},{"key":"e_1_3_2_1_37_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350850"},{"key":"e_1_3_2_1_39_1","volume-title":"SAFE: Similarity-Aware Multi-modal Fake News Detection. In Pacific-Asia Conference on Knowledge Discovery and Data Mining. Springer, 354--367","author":"Zhou Xinyi","year":"2020","unstructured":"Xinyi Zhou, Jindi Wu, and Reza Zafarani. 2020. SAFE: Similarity-Aware Multi-modal Fake News Detection. In Pacific-Asia Conference on Knowledge Discovery and Data Mining. Springer, 354--367."},{"key":"e_1_3_2_1_40_1","volume-title":"Multimodal Fake News Detection via CLIP-Guided Learning. arXiv preprint arXiv:2205.14304","author":"Zhou Yangming","year":"2022","unstructured":"Yangming Zhou, Qichao Ying, Zhenxing Qian, Sheng Li, and Xinpeng Zhang. 2022. Multimodal Fake News Detection via CLIP-Guided Learning. arXiv preprint arXiv:2205.14304 (2022)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3161603"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613850","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613850","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:55:49Z","timestamp":1755820549000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613850"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":41,"alternative-id":["10.1145\/3581783.3613850","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613850","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}