{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T04:35:55Z","timestamp":1781584555769,"version":"3.54.5"},"reference-count":64,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T00:00:00Z","timestamp":1745193600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T00:00:00Z","timestamp":1745193600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s13735-025-00365-9","type":"journal-article","created":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T08:48:51Z","timestamp":1745225331000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Multimodal scene-graph matching for cheapfakes detection"],"prefix":"10.1007","volume":"14","author":[{"given":"Minh-Tam","family":"Nguyen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Quynh T.","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minh Son","family":"Dao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Binh T.","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,4,21]]},"reference":[{"key":"365_CR1","doi-asserted-by":"crossref","unstructured":"Aneja S, Bregler C, Nie\u00dfner M (2023) Cosmos: catching out-of-context image misuse using self-supervised learning. In: Proceedings of the AAAI Conference on Artificial Intelligence 37:14084\u201314092","DOI":"10.1609\/aaai.v37i12.26648"},{"key":"365_CR2","doi-asserted-by":"crossref","unstructured":"Wang S, Wang R, Yao Z, Shan S, Chen X (2020) Cross-modal scene graph matching for relationship-aware image-text retrieval. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","DOI":"10.1109\/WACV45572.2020.9093614"},{"key":"365_CR3","doi-asserted-by":"publisher","unstructured":"Nguyen M, Nguyen BT, Gurrin C (2021) A deep local and global scene-graph matching for image-text retrieval. In: Fujita, H., P\u00e9rez-Meana, H. (eds.) New Trends in Intelligent Software Methodologies, Tools and Techniques - Proceedings of the 20th International Conference on New Trends in Intelligent Software Methodologies, Tools and Techniques, SoMet 202, Cancun, Mexico, 21-23 September, 2021. Frontiers in Artificial Intelligence and Applications, vol. 337, pp. 510\u2013523. https:\/\/doi.org\/10.3233\/FAIA210049","DOI":"10.3233\/FAIA210049"},{"key":"365_CR4","doi-asserted-by":"crossref","unstructured":"Johnson J, Krishna R, Stark M, Li L-J, Shamma D, Bernstein M, Fei-Fei L (2015) Image retrieval using scene graphs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"365_CR5","doi-asserted-by":"publisher","unstructured":"Wang Y, Dai P, Jia X, Zeng Z, Li R, Cao X (2023) Hi-sigir: Hierachical semantic-guided image-to-image retrieval via scene graph. In: Proceedings of the 31st ACM International Conference on Multimedia. MM \u201923, pp. 6400\u20136409. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3581783.3612283","DOI":"10.1145\/3581783.3612283"},{"issue":"1","key":"365_CR6","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/s13735-023-00312-6","volume":"13","author":"S-I Papadopoulos","year":"2024","unstructured":"Papadopoulos S-I, Koutlis C, Papadopoulos S, Petrantonakis PC (2024) Verite: a robust benchmark for multimodal misinformation detection accounting for unimodal bias. Int J Multimedia Information Retrieval 13(1):4","journal-title":"Int J Multimedia Information Retrieval"},{"key":"365_CR7","doi-asserted-by":"publisher","first-page":"132","DOI":"10.1007\/978-3-031-53302-0_10","volume-title":"MultiMedia Modeling","author":"K-L Pham","year":"2024","unstructured":"Pham K-L, Nguyen-Nhat M-K, Dinh A-H, Le Q-T, Nguyen M-T, Tran A-D, Tran M-T, Dang-Nguyen D-T (2024) Ookpik - a collection of out-of-context image-caption pairs. In: Rudinac S, Hanjalic A, Liem C, Worring M, J\u00f3nsson B\u00c3, Liu B, Yamakata Y (eds) MultiMedia Modeling. Springer, Cham, pp 132\u2013144"},{"issue":"2","key":"365_CR8","doi-asserted-by":"publisher","first-page":"28","DOI":"10.1007\/s13735-023-00296-3","volume":"12","author":"S Tufchi","year":"2023","unstructured":"Tufchi S, Yadav A, Ahmed T (2023) A comprehensive survey of multimodal fake news detection techniques: advances, challenges, and opportunities. Int J Multimed Information Retrieval 12(2):28","journal-title":"Int J Multimed Information Retrieval"},{"key":"365_CR9","doi-asserted-by":"publisher","unstructured":"Akgul T, Civelek TE, Ugur D, Begen AC (2021) Cosmos on steroids: a cheap detector for cheapfakes. In: Proceedings of the 12th ACM Multimedia Systems Conference. MMSys \u201921, pp. 327\u2013331. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3458305.3479968","DOI":"10.1145\/3458305.3479968"},{"issue":"11","key":"365_CR10","doi-asserted-by":"publisher","first-page":"423","DOI":"10.3390\/a15110423","volume":"15","author":"TV La","year":"2022","unstructured":"La TV, Dao MS, Le DD, Thai KP, Nguyen QH, Phan-Thi TK (2022) Leverage boosting and transformer on text-image matching for cheap fakes detection. Algorithms 15(11):423","journal-title":"Algorithms"},{"key":"365_CR11","doi-asserted-by":"publisher","unstructured":"Tran Q-T, Tran T-P, Dao M-S, La T-V, Tran A-D, Dang\u00a0Nguyen DT (2022) A textual-visual-entailment-based unsupervised algorithm for cheapfake detection. In: Proceedings of the 30th ACM International Conference on Multimedia. MM \u201922, pp. 7145\u20137149. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3503161.3551596","DOI":"10.1145\/3503161.3551596"},{"key":"365_CR12","doi-asserted-by":"publisher","unstructured":"La T-V, Dao M-S, Tran Q-T, Tran T-P, Tran A-D, Dang-Nguyen D-T (2022) A combination of visual-semantic reasoning and text entailment-based boosting algorithm for cheapfake detection. In: Proceedings of the 30th ACM International Conference on Multimedia. MM \u201922, pp. 7140\u20137144. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3503161.3551595","DOI":"10.1145\/3503161.3551595"},{"key":"365_CR13","doi-asserted-by":"publisher","unstructured":"Wu G, Wu W, Liu X, Xu K, Wan T, Wang W (2023) Cheap-fake detection with llm using prompt engineering. In: 2023 IEEE International Conference on Multimedia and Expo Workshops (ICMEW), pp. 105\u2013109. https:\/\/doi.org\/10.1109\/ICMEW59549.2023.00025","DOI":"10.1109\/ICMEW59549.2023.00025"},{"key":"365_CR14","doi-asserted-by":"publisher","unstructured":"Pham K-L, Nguyen M-T, Tran A-D, Dao M-S, Dang-Nguyen D-T (2023) Detecting cheapfakes using self-query adaptive-context learning. In: Proceedings of the 4th ACM Workshop on Intelligent Cross-Data Analysis and Retrieval. ICDAR \u201923, pp. 60\u201363. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3592571.3592972","DOI":"10.1145\/3592571.3592972"},{"key":"365_CR15","doi-asserted-by":"publisher","unstructured":"Moholdt E, Khan SA, Dang-Nguyen D-T (2023) Detecting out-of-context image-caption pair in news: A counter-intuitive method. In: Proceedings of the 20th International Conference on Content-Based Multimedia Indexing. CBMI \u201923, pp. 203\u2013209. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3617233.3617274","DOI":"10.1145\/3617233.3617274"},{"key":"365_CR16","doi-asserted-by":"publisher","unstructured":"Dao M-S, Zettsu K (2023) Leveraging knowledge graphs for cheapfakes detection: Beyond dataset evaluation. In: 2023 IEEE International Conference on Multimedia and Expo Workshops (ICMEW), pp. 99\u2013104. https:\/\/doi.org\/10.1109\/ICMEW59549.2023.00024","DOI":"10.1109\/ICMEW59549.2023.00024"},{"issue":"1","key":"365_CR17","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s13735-023-00315-3","volume":"13","author":"A Yadav","year":"2024","unstructured":"Yadav A, Gupta A (2024) An emotion-driven, transformer-based network for multimodal fake news detection. Int J Multimedia Information Retrieval 13(1):1\u201316","journal-title":"Int J Multimedia Information Retrieval"},{"key":"365_CR18","doi-asserted-by":"publisher","first-page":"88006","DOI":"10.1109\/ACCESS.2024.3418340","volume":"12","author":"C Xu","year":"2024","unstructured":"Xu C, Kechadi M-T (2024) An enhanced fake news detection system with fuzzy deep learning. IEEE Access 12:88006\u201388021. https:\/\/doi.org\/10.1109\/ACCESS.2024.3418340","journal-title":"IEEE Access"},{"key":"365_CR19","doi-asserted-by":"crossref","unstructured":"Devank, Kalla, J., Biswas, S.: Covlm: Leveraging consensus from vision-language models for semi-supervised multi-modal fake news detection. In: Cho, M., Laptev, I., Tran, D., Yao, A., Zha, H. (eds.) Computer Vision \u2013 ACCV 2024, pp. 172\u2013189. Springer, Singapore (2025)","DOI":"10.1007\/978-981-96-0960-4_11"},{"key":"365_CR20","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I (2021) Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 139, pp. 8748\u20138763. PMLR, ???. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"365_CR21","unstructured":"Li J, Li D, Xiong C, Hoi S (2022) BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Chaudhuri, K., Jegelka, S., Song, L., Szepesvari, C., Niu, G., Sabato, S. (eds.) Proceedings of the 39th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 162, pp. 12888\u201312900. PMLR, ???. https:\/\/proceedings.mlr.press\/v162\/li22n.html"},{"key":"365_CR22","doi-asserted-by":"crossref","unstructured":"Shao R, Wu T, Liu Z (2023) Detecting and grounding multi-modal media manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6904\u20136913","DOI":"10.1109\/CVPR52729.2023.00667"},{"key":"365_CR23","doi-asserted-by":"publisher","unstructured":"Tahmasebi S, Hakimov S, Ewerth R, M\u00fcller-Budack E (2023) Improving generalization for multimodal fake news detection. In: Proceedings of the 2023 ACM International Conference on Multimedia Retrieval. ICMR \u201923, pp. 581\u2013585. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3591106.3592230","DOI":"10.1145\/3591106.3592230"},{"key":"365_CR24","doi-asserted-by":"publisher","unstructured":"Luo G, Darrell T, Rohrbach A (2021) NewsCLIPpings: Automatic Generation of Out-of-Context Multimodal Media. In: Moens, M.-F., Huang, X., Specia, L., Yih, S.W.-t. (eds.) Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 6801\u20136817. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic. https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.545","DOI":"10.18653\/v1\/2021.emnlp-main.545"},{"key":"365_CR25","doi-asserted-by":"publisher","unstructured":"Jaiswal A, Sabir E, AbdAlmageed W, Natarajan P (2017) Multimedia semantic integrity assessment using joint embedding of images and text. In: Proceedings of the 25th ACM International Conference on Multimedia. MM \u201917, pp. 1465\u20131471. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3123266.3123385","DOI":"10.1145\/3123266.3123385"},{"key":"365_CR26","doi-asserted-by":"publisher","unstructured":"Biamby G, Luo G, Darrell T, Rohrbach A (2022) Twitter-COMMs: Detecting climate, COVID, and military multimodal misinformation. In: Carpuat, M., Marneffe, M.-C., Meza\u00a0Ruiz, I.V. (eds.) Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 1530\u20131549. Association for Computational Linguistics, Seattle, United States. https:\/\/doi.org\/10.18653\/v1\/2022.naacl-main.110","DOI":"10.18653\/v1\/2022.naacl-main.110"},{"issue":"1","key":"365_CR27","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TPAMI.2021.3137605","volume":"45","author":"X Chang","year":"2023","unstructured":"Chang X, Ren P, Xu P, Li Z, Chen X, Hauptmann A (2023) A comprehensive survey of scene graphs: Generation and application. IEEE Trans Pattern Anal Mach Intell 45(1):1\u201326. https:\/\/doi.org\/10.1109\/TPAMI.2021.3137605","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"365_CR28","doi-asserted-by":"crossref","unstructured":"Zellers R, Yatskar M, Thomson S, Choi Y (2018) Neural motifs: Scene graph parsing with global context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2018.00611"},{"key":"365_CR29","doi-asserted-by":"crossref","unstructured":"Tang K, Zhang H, Wu B, Luo W, Liu W (2019) Learning to compose dynamic tree structures for visual contexts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2019.00678"},{"key":"365_CR30","doi-asserted-by":"crossref","unstructured":"Xu D, Zhu Y, Choy CB, Fei-Fei L (2017) Scene graph generation by iterative message passing. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2017.330"},{"key":"365_CR31","doi-asserted-by":"crossref","unstructured":"Tang K, Niu Y, Huang J, Shi J, Zhang H (2020) Unbiased scene graph generation from biased training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR42600.2020.00377"},{"key":"365_CR32","doi-asserted-by":"crossref","unstructured":"Dong X, Gan T, Song X, Wu J, Cheng Y, Nie L (2022) Stacked hybrid-attention and group collaborative learning for unbiased scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19427\u201319436","DOI":"10.1109\/CVPR52688.2022.01882"},{"key":"365_CR33","doi-asserted-by":"crossref","unstructured":"Lin X, Ding C, Zhan Y, Li Z, Tao D (2022) Hl-net: Heterophily learning network for scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19476\u201319485","DOI":"10.1109\/CVPR52688.2022.01887"},{"key":"365_CR34","doi-asserted-by":"crossref","unstructured":"Lin X, Ding C, Zhang J, Zhan Y, Tao D (2022) Ru-net: Regularized unrolling network for scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19457\u201319466","DOI":"10.1109\/CVPR52688.2022.01885"},{"key":"365_CR35","doi-asserted-by":"publisher","unstructured":"Yan S, Shen C, Jin Z, Huang J, Jiang R, Chen Y, Hua X-S (2020) Pcpl: Predicate-correlation perception learning for unbiased scene graph generation. In: Proceedings of the 28th ACM International Conference on Multimedia. MM \u201920, pp. 265\u2013273. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3394171.3413722","DOI":"10.1145\/3394171.3413722"},{"key":"365_CR36","doi-asserted-by":"crossref","unstructured":"Zhang A, Yao Y, Chen Q, Ji W, Liu Z, Sun M, Chua T-S (2022) Fine-grained scene graph generation with data transfer. In: European Conference on Computer Vision, pp. 409\u2013424. Springer","DOI":"10.1007\/978-3-031-19812-0_24"},{"key":"365_CR37","doi-asserted-by":"crossref","unstructured":"Zheng C, Lyu X, Gao L, Dai B, Song J (2023) Prototype-based embedding network for scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22783\u201322792","DOI":"10.1109\/CVPR52729.2023.02182"},{"key":"365_CR38","doi-asserted-by":"publisher","unstructured":"Schuster S, Krishna R, Chang A, Fei-Fei L, Manning CD (2015) Generating semantically precise scene graphs from textual descriptions for improved image retrieval. In: Belz, A., Coheur, L., Ferrari, V., Moens, M.-F., Pastra, K., Vuli\u0107, I. (eds.) Proceedings of the Fourth Workshop on Vision and Language, pp. 70\u201380. Association for Computational Linguistics, Lisbon, Portugal. https:\/\/doi.org\/10.18653\/v1\/W15-2812","DOI":"10.18653\/v1\/W15-2812"},{"key":"365_CR39","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: Semantic propositional image caption evaluation. In: ECCV","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"365_CR40","doi-asserted-by":"publisher","unstructured":"Wang Y-S, Liu C, Zeng X, Yuille A (2018) Scene graph parsing as dependency parsing. In: Walker, M., Ji, H., Stent, A. (eds.) Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers), pp. 397\u2013407. Association for Computational Linguistics, New Orleans, Louisiana. https:\/\/doi.org\/10.18653\/v1\/N18-1037","DOI":"10.18653\/v1\/N18-1037"},{"key":"365_CR41","doi-asserted-by":"publisher","unstructured":"Choi WS, Heo Y-J, Punithan D, Zhang B-T (2022) Scene graph parsing via Abstract Meaning Representation in pre-trained language models. In: Wu, L., Liu, B., Mihalcea, R., Pei, J., Zhang, Y., Li, Y. (eds.) Proceedings of the 2nd Workshop on Deep Learning on Graphs for Natural Language Processing (DLG4NLP 2022), pp. 30\u201335. Association for Computational Linguistics, Seattle, Washington. https:\/\/doi.org\/10.18653\/v1\/2022.dlg4nlp-1.4","DOI":"10.18653\/v1\/2022.dlg4nlp-1.4"},{"key":"365_CR42","doi-asserted-by":"publisher","unstructured":"Li Z, Chai Y, Zhuo TY, Qu L, Haffari G, Li F, Ji D, Tran QH (2023) FACTUAL: A benchmark for faithful and consistent textual scene graph parsing. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Findings of the Association for Computational Linguistics: ACL 2023, pp. 6377\u20136390. Association for Computational Linguistics, Toronto, Canada. https:\/\/doi.org\/10.18653\/v1\/2023.findings-acl.398","DOI":"10.18653\/v1\/2023.findings-acl.398"},{"key":"365_CR43","doi-asserted-by":"crossref","unstructured":"Girshick R (2015) Fast r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2015.169"},{"key":"365_CR44","doi-asserted-by":"publisher","unstructured":"Gao L, Wang B, Wang W (2018) Image captioning with scene-graph based semantic concepts. In: Proceedings of the 2018 10th International Conference on Machine Learning and Computing. ICMLC \u201918, pp. 225\u2013229. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3195106.3195114","DOI":"10.1145\/3195106.3195114"},{"key":"365_CR45","doi-asserted-by":"crossref","unstructured":"Kim D-J, Choi J, Oh T-H, Kweon IS (2019) Dense relational captioning: Triple-stream networks for relationship-based captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2019.00643"},{"key":"365_CR46","doi-asserted-by":"crossref","unstructured":"Yang X, Tang K, Zhang H, Cai J (2019) Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2019.01094"},{"key":"365_CR47","doi-asserted-by":"crossref","unstructured":"Li L, Gan Z, Cheng Y, Liu J (2019) Relation-aware graph attention network for visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2019.01041"},{"key":"365_CR48","unstructured":"Zhang C, Chao W-L, Xuan D (2019) An empirical study on leveraging scene graphs for visual question answering. In: 30th British Machine Vision Conference 2019, BMVC 2019, Cardiff, UK, September 9-12, 2019, p. 288"},{"key":"365_CR49","unstructured":"Chen T, Kornblith S, Norouzi M, Hinton G (2020) A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR"},{"key":"365_CR50","doi-asserted-by":"publisher","unstructured":"Qi P, Cao J, Li X, Liu H, Sheng Q, Mi X, He Q, Lv Y, Guo C, Yu Y (2021) Improving fake news detection by using an entity-enhanced framework to fuse diverse multimodal clues. In: Proceedings of the 29th ACM International Conference on Multimedia. MM \u201921, pp. 1212\u20131220. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3474085.3481548","DOI":"10.1145\/3474085.3481548"},{"key":"365_CR51","doi-asserted-by":"publisher","unstructured":"Pennington J, Socher R, Manning C (2014) GloVe: Global vectors for word representation. In: Moschitti, A., Pang, B., Daelemans, W. (eds.) Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543. Association for Computational Linguistics, Doha, Qatar. https:\/\/doi.org\/10.3115\/v1\/D14-1162","DOI":"10.3115\/v1\/D14-1162"},{"key":"365_CR52","doi-asserted-by":"crossref","unstructured":"Bai Y, Ding H, Qiao Y, Marinovic A, Gu K, Chen T, Sun Y, Wang W (2019) Unsupervised inductive graph-level representation learning via graph-graph proximity. In: Proceedings of the 28th International Joint Conference on Artificial Intelligence. IJCAI\u201919, pp. 1988\u20131994","DOI":"10.24963\/ijcai.2019\/275"},{"key":"365_CR53","doi-asserted-by":"publisher","unstructured":"Williams A, Nangia N, Bowman S (2018) A broad-coverage challenge corpus for sentence understanding through inference. In: Walker, M., Ji, H., Stent, A. (eds.) Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers), pp. 1112\u20131122. Association for Computational Linguistics, New Orleans, Louisiana. https:\/\/doi.org\/10.18653\/v1\/N18-1101","DOI":"10.18653\/v1\/N18-1101"},{"key":"365_CR54","doi-asserted-by":"publisher","unstructured":"Lewis M, Liu Y, Goyal N, Ghazvininejad M, Mohamed A, Levy O, Stoyanov V, Zettlemoyer L (2020) BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J. (eds.) Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 7871\u20137880. Association for Computational Linguistics, Online. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.703","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"365_CR55","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2019) Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692"},{"key":"365_CR56","unstructured":"He P, Gao J, Chen W (2023) DeBERTav3: Improving deBERTa using ELECTRA-style pre-training with gradient-disentangled embedding sharing. In: The Eleventh International Conference on Learning Representations"},{"key":"365_CR57","doi-asserted-by":"publisher","unstructured":"Singhal S, Shah RR, Chakraborty T, Kumaraguru P, Satoh S (2019) Spotfake: A multi-modal framework for fake news detection. In: 2019 IEEE Fifth International Conference on Multimedia Big Data (BigMM), pp. 39\u201347. https:\/\/doi.org\/10.1109\/BigMM.2019.00-44","DOI":"10.1109\/BigMM.2019.00-44"},{"key":"365_CR58","doi-asserted-by":"publisher","unstructured":"Wang Y, Ma F, Jin Z, Yuan Y, Xun G, Jha K, Su L, Gao J (2018) Eann: Event adversarial neural networks for multi-modal fake news detection. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. KDD \u201918, pp. 849\u2013857. Association for Computing Machinery, New York, NY, USA. https:\/\/doi.org\/10.1145\/3219819.3219903","DOI":"10.1145\/3219819.3219903"},{"key":"365_CR59","doi-asserted-by":"crossref","unstructured":"La T-V, Tran Q-T, Tran T-P, Tran A-D, Dang-Nguyen D-T, Dao M-S (2022) Multimodal cheapfakes detection by utilizing image captioning for global context. In: Proceedings of the 3rd ACM Workshop on Intelligent Cross-Data Analysis and Retrieval, pp. 9\u201316","DOI":"10.1145\/3512731.3534210"},{"key":"365_CR60","doi-asserted-by":"publisher","unstructured":"Zhang Z, Han X, Liu Z, Jiang X, Sun M, Liu Q (2019) ERNIE: Enhanced language representation with informative entities. In: Korhonen, A., Traum, D., M\u00e0rquez, L. (eds.) Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 1441\u20131451. Association for Computational Linguistics, Florence, Italy. https:\/\/doi.org\/10.18653\/v1\/P19-1139 . https:\/\/aclanthology.org\/P19-1139\/","DOI":"10.18653\/v1\/P19-1139"},{"key":"365_CR61","doi-asserted-by":"publisher","unstructured":"Wang WY (2017) \u201cliar, liar pants on fire\u201d: A new benchmark dataset for fake news detection. In: Barzilay, R., Kan, M.-Y. (eds.) Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 422\u2013426. Association for Computational Linguistics, Vancouver, Canada. https:\/\/doi.org\/10.18653\/v1\/P17-2067 . https:\/\/aclanthology.org\/P17-2067\/","DOI":"10.18653\/v1\/P17-2067"},{"key":"365_CR62","doi-asserted-by":"crossref","unstructured":"Kim J, Park J, Park J, Kim J, Kim S, Kim HJ (2024) Groupwise query specialization and quality-aware multi-assignment for transformer-based visual relationship detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 28160\u201328169","DOI":"10.1109\/CVPR52733.2024.02660"},{"key":"365_CR63","doi-asserted-by":"crossref","unstructured":"Im J, Nam J, Park N, Lee H, Park S (2024) Egtr: Extracting graph from transformer for scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 24229\u201324238","DOI":"10.1109\/CVPR52733.2024.02287"},{"key":"365_CR64","doi-asserted-by":"crossref","unstructured":"Wang G, Li Z, Chen Q, Liu Y (2024) Oed: Towards one-stage end-to-end dynamic scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 27938\u201327947","DOI":"10.1109\/CVPR52733.2024.02639"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00365-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-025-00365-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00365-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T09:41:22Z","timestamp":1749548482000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-025-00365-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,21]]},"references-count":64,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["365"],"URL":"https:\/\/doi.org\/10.1007\/s13735-025-00365-9","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-4553024\/v1","asserted-by":"object"}]},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,21]]},"assertion":[{"value":"9 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 February 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 March 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 April 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"17"}}