{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,9]],"date-time":"2025-11-09T03:55:28Z","timestamp":1762660528807,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,18]]},"DOI":"10.1145\/3703323.3703332","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T12:03:28Z","timestamp":1750853008000},"page":"52-60","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["ReFrame: Rectification Framework for Image Explaining Architectures"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3086-4501","authenticated-orcid":false,"given":"Debjyoti","family":"Das Adhikary","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, West Bengal, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2076-3577","authenticated-orcid":false,"given":"Aritra","family":"Hazra","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, West Bengal, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3553-8834","authenticated-orcid":false,"given":"Partha Pratim","family":"Chakrabarti","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, Indian Institute of Technology, Kharagpur, Kharagpur, West Bengal, India"}]}],"member":"320","published-online":{"date-parts":[[2025,6,25]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_3_2_4_2","unstructured":"Zechen Bai Pichao Wang Tianjun Xiao Tong He Zongbo Han Zheng Zhang and Mike\u00a0Zheng Shou. 2024. Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:2404.18930 (2024)."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i10.29047"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Yogesh\u00a0K Dwivedi Nir Kshetri Laurie Hughes Emma\u00a0Louise Slade Anand Jeyaraj Arpan\u00a0Kumar Kar Abdullah\u00a0M Baabdullah Alex Koohang Vishnupriya Raghavan Manju Ahuja et\u00a0al. 2023. \u201cSo what if ChatGPT wrote it?\u201d Multidisciplinary perspectives on opportunities challenges and implications of generative conversational AI for research practice and policy. International Journal of Information Management 71 (2023) 102642.","DOI":"10.1016\/j.ijinfomgt.2023.102642"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Guowei Ge Yufeng Han Lingguang Hao Kuangrong Hao Bing Wei and Xue-song Tang. 2024. Show tell and rectify: Boost image caption generation via an output rectifier. Neurocomputing (2024) 127651.","DOI":"10.1016\/j.neucom.2024.127651"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"MD\u00a0Zakir Hossain Ferdous Sohel Mohd\u00a0Fairuz Shiratuddin and Hamid Laga. 2019. A comprehensive survey of deep learning for image captioning. ACM Computing Surveys (CsUR) 51 6 (2019) 1\u201336.","DOI":"10.1145\/3295748"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00277"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Wen Huang Hongbin Liu Minxin Guo and Neil\u00a0Zhenqiang Gong. 2024. Visual hallucinations of multi-modal large language models. arXiv preprint arXiv:2402.14683 (2024).","DOI":"10.18653\/v1\/2024.findings-acl.573"},{"key":"e_1_3_3_2_15_2","first-page":"5583","volume-title":"International conference on machine learning","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583\u20135594."},{"key":"e_1_3_3_2_16_2","first-page":"595","volume-title":"International conference on machine learning","author":"Kiros Ryan","year":"2014","unstructured":"Ryan Kiros, Ruslan Salakhutdinov, and Rich Zemel. 2014. Multimodal neural language models. In International conference on machine learning. PMLR, 595\u2013603."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Chenliang Li Haiyang Xu Junfeng Tian Wei Wang Ming Yan Bin Bi Jiabo Ye Hehong Chen Guohai Xu Zheng Cao et\u00a0al. 2022. mplug: Effective and efficient vision-language learning by cross-modal skip-connections. arXiv preprint arXiv:2205.12005 (2022).","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3555776.3577794"},{"key":"e_1_3_3_2_20_2","unstructured":"Tomas Mikolov Kai Chen Greg Corrado and Jeffrey Dean. 2013. Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)."},{"key":"e_1_3_3_2_21_2","unstructured":"Tomas Mikolov Ilya Sutskever Kai Chen Greg\u00a0S Corrado and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. Advances in neural information processing systems 26 (2013)."},{"key":"e_1_3_3_2_22_2","first-page":"689","volume-title":"Proceedings of the 28th international conference on machine learning (ICML-11)","author":"Ngiam Jiquan","year":"2011","unstructured":"Jiquan Ngiam, Aditya Khosla, Mingyu Kim, Juhan Nam, Honglak Lee, and Andrew\u00a0Y Ng. 2011. Multimodal deep learning. In Proceedings of the 28th international conference on machine learning (ICML-11). 689\u2013696."},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25285"},{"key":"e_1_3_3_2_24_2","unstructured":"Xuran Pan Tianzhu Ye Dongchen Han Shiji Song and Gao Huang. 2022. Contrastive language-image pre-training with knowledge graphs. Advances in Neural Information Processing Systems 35 (2022) 22895\u201322910."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Thorsten Rudroff. 2024. Revealing the Complexity of Fatigue: A Review of the Persistent Challenges and Promises of Artificial Intelligence. Brain sciences 14 2 (2024) 186.","DOI":"10.3390\/brainsci14020186"},{"key":"e_1_3_3_2_26_2","unstructured":"Yongliang Shen Kaitao Song Xu Tan Dongsheng Li Weiming Lu and Yueting Zhuang. 2024. Hugginggpt: Solving ai tasks with chatgpt and its friends in hugging face. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Matteo Stefanini Marcella Cornia Lorenzo Baraldi Silvia Cascianelli Giuseppe Fiameni and Rita Cucchiara. 2022. From show to tell: A survey on deep learning-based image captioning. IEEE transactions on pattern analysis and machine intelligence 45 1 (2022) 539\u2013559.","DOI":"10.1109\/TPAMI.2022.3148210"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Oriol Vinyals Alexander Toshev Samy Bengio and Dumitru Erhan. 2016. Show and tell: Lessons learned from the 2015 mscoco image captioning challenge. IEEE transactions on pattern analysis and machine intelligence 39 4 (2016) 652\u2013663.","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"e_1_3_3_2_30_2","unstructured":"Yiqi Wang Wentao Chen Xiaotian Han Xudong Lin Haiteng Zhao Yongfei Liu Bohan Zhai Jianbo Yuan Quanzeng You and Hongxia Yang. 2024. Exploring the reasoning abilities of multimodal large language models (mllms): A comprehensive survey on emerging trends in multimodal reasoning. arXiv preprint arXiv:2401.06805 (2024)."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Rui Zhao Ruqiang Yan Zhenghua Chen Kezhi Mao Peng Wang and Robert\u00a0X Gao. 2019. Deep learning and its applications to machine health monitoring. Mechanical Systems and Signal Processing 115 (2019) 213\u2013237.","DOI":"10.1016\/j.ymssp.2018.05.050"}],"event":{"name":"CODS-COMAD 2024: 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)","location":"Jodhpur India","acronym":"CODS-COMAD Dec '24"},"container-title":["Proceedings of the 8th International Conference on Data Science and Management of Data (12th ACM IKDD CODS and 30th COMAD)"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3703323.3703332","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T13:05:50Z","timestamp":1750856750000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3703323.3703332"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,18]]},"references-count":32,"alternative-id":["10.1145\/3703323.3703332","10.1145\/3703323"],"URL":"https:\/\/doi.org\/10.1145\/3703323.3703332","relation":{},"subject":[],"published":{"date-parts":[[2024,12,18]]},"assertion":[{"value":"2025-06-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}