{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,24]],"date-time":"2025-08-24T01:29:10Z","timestamp":1755998950509,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":99,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176058"],"award-info":[{"award-number":["62176058"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFF1204800"],"award-info":[{"award-number":["2023YFF1204800"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681529","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"1971-1980","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["ReForm-Eval: Evaluating Large Vision Language Models via Unified Re-Formulation of Task-Oriented Benchmarks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7443-8027","authenticated-orcid":false,"given":"Zejun","family":"Li","sequence":"first","affiliation":[{"name":"School of Data Science, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0062-3181","authenticated-orcid":false,"given":"Ye","family":"Wang","sequence":"additional","affiliation":[{"name":"Academy for Engineering and Technology, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3078-417X","authenticated-orcid":false,"given":"Mengfei","family":"Du","sequence":"additional","affiliation":[{"name":"School of Data Science, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3161-5158","authenticated-orcid":false,"given":"Qingwen","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Science and Technology for Brain-Inspired Intelligence, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4377-7679","authenticated-orcid":false,"given":"Binhao","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Data Science, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8060-6824","authenticated-orcid":false,"given":"Jiwen","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Data Science, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8809-2422","authenticated-orcid":false,"given":"Chengxing","family":"Zhou","sequence":"additional","affiliation":[{"name":"Software College, Northeastern University, Shenyang, Liaoning, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9910-7937","authenticated-orcid":false,"given":"Zhihao","family":"Fan","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4494-843X","authenticated-orcid":false,"given":"Jie","family":"Fu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1737-3420","authenticated-orcid":false,"given":"Jingjing","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3789-8507","authenticated-orcid":false,"given":"Zhongyu","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Data Science, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9197-9426","authenticated-orcid":false,"given":"Xuanjing","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. OmniLMM: Large multi-modal models for strong performance and efficient deployment. https:\/\/github.com\/OpenBMB\/OmniLMM"},{"key":"e_1_3_2_1_2_1","volume-title":"Nocaps: Novel object captioning at scale. In ICCV. 8948--8957.","author":"Agrawal Harsh","year":"2019","unstructured":"Harsh Agrawal, Karan Desai, Yufei Wang, Xinlei Chen, Rishabh Jain, Mark Johnson, Dhruv Batra, Devi Parikh, Stefan Lee, and Peter Anderson. 2019. Nocaps: Novel object captioning at scale. In ICCV. 8948--8957."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-022-07717-0"},{"key":"e_1_3_2_1_4_1","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, et al. 2022. Flamingo: a visual language model for few-shot learning. NIPS 35 (2022), 23716--23736.","journal-title":"NIPS"},{"key":"e_1_3_2_1_5_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Breaking common sense: Whoops! a vision-and-language benchmark of synthetic and compositional images. arXiv:2303.07274","author":"Bitton-Guetta Nitzan","year":"2023","unstructured":"Nitzan Bitton-Guetta, Yonatan Bitton, Jack Hessel, Ludwig Schmidt, Yuval Elovici, Gabriel Stanovsky, and Roy Schwartz. 2023. Breaking common sense: Whoops! a vision-and-language benchmark of synthetic and compositional images. arXiv:2303.07274 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Matterport3d: Learning from rgb-d data in indoor environments. arXiv:1709.06158","author":"Chang Angel","year":"2017","unstructured":"Angel Chang, Angela Dai, Thomas Funkhouser, Maciej Halber, Matthias Niessner, Manolis Savva, Shuran Song, Andy Zeng, and Yinda Zhang. 2017. Matterport3d: Learning from rgb-d data in indoor environments. arXiv:1709.06158 (2017)."},{"key":"e_1_3_2_1_8_1","volume-title":"Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478","author":"Chen Jun","year":"2023","unstructured":"Jun Chen, Deyao Zhu, Xiaoqian Shen, Xiang Li, Zechun Liu, Pengchuan Zhang, Raghuraman Krishnamoorthi, Vikas Chandra, Yunyang Xiong, and Mohamed Elhoseiny. 2023. Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Sharegpt4v: Improving large multi-modal models with better captions. arXiv:2311.12793","author":"Chen Lin","year":"2023","unstructured":"Lin Chen, Jisong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2023. Sharegpt4v: Improving large multi-modal models with better captions. arXiv:2311.12793 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020. Uniter: Universal image-text representation learning. In ECCV. Springer, 104--120."},{"key":"e_1_3_2_1_12_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03--30-vicuna\/"},{"key":"e_1_3_2_1_13_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_14_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv:2305.06500 [cs.CV]"},{"key":"e_1_3_2_1_15_1","volume-title":"Devi Parikh, and Dhruv Batra.","author":"Das Abhishek","year":"2017","unstructured":"Abhishek Das, Satwik Kottur, Khushi Gupta, Avi Singh, Deshraj Yadav, Jos\u00e9 MF Moura, Devi Parikh, and Dhruv Batra. 2017. Visual dialog. In CVPR. 326--335."},{"key":"e_1_3_2_1_16_1","volume-title":"Imagenet: A large-scale hierarchical image database. In CVPR. Ieee, 248--255.","author":"Deng Jia","year":"2009","unstructured":"Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database. In CVPR. Ieee, 248--255."},{"key":"e_1_3_2_1_17_1","volume-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv:2306.13394","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Zhenyu Qiu, Wei Lin, Jinrui Yang, Xiawu Zheng, et al. 2023. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv:2306.13394 (2023)."},{"key":"e_1_3_2_1_18_1","unstructured":"Peng Gao Jiaming Han Renrui Zhang Ziyi Lin Shijie Geng Aojun Zhou Wei Zhang Pan Lu Conghui He Xiangyu Yue et al. 2023. Llama-adapter v2: Parameter-efficient visual instruction model. arXiv:2304.15010 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Armand Joulin, and Ishan Misra.","author":"Girdhar Rohit","year":"2023","unstructured":"Rohit Girdhar, Alaaeldin El-Nouby, Zhuang Liu, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, and Ishan Misra. 2023. Imagebind: One embedding space to bind them all. In CVPR. 15180--15190."},{"key":"e_1_3_2_1_20_1","volume-title":"Multimodalgpt: A vision and language model for dialogue with humans. arXiv:2305.04790","author":"Gong Tao","year":"2023","unstructured":"Tao Gong, Chengqi Lyu, Shilong Zhang, Yudong Wang, Miao Zheng, Qian Zhao, Kuikun Liu, Wenwei Zhang, Ping Luo, and Kai Chen. 2023. Multimodalgpt: A vision and language model for dialogue with humans. arXiv:2305.04790 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Yash Goyal Tejas Khot Douglas Summers-Stay Dhruv Batra and Devi Parikh. 2017. Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In CVPR. 6904--6913.","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Danna Gurari Qing Li Abigale J Stangl Anhong Guo Chi Lin Kristen Grauman Jiebo Luo and Jeffrey P Bigham. 2018. Vizwiz grand challenge: Answering visual questions from blind people. In CVPR. 3608--3617.","DOI":"10.1109\/CVPR.2018.00380"},{"key":"e_1_3_2_1_23_1","unstructured":"Jiaming Han Renrui Zhang Wenqi Shao Peng Gao Peng Xu Han Xiao Kaipeng Zhang Chris Liu Song Wen Ziyu Guo et al. 2023. ImageBind-LLM: Multi-modality Instruction Tuning. arXiv:2309.03905 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Measuring massive multitask language understanding. arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual Questions. arXiv:2308.09936","author":"Hu Wenbo","year":"2023","unstructured":"Wenbo Hu, Yifan Xu, Y Li, W Li, Z Chen, and Z Tu. 2023. BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual Questions. arXiv:2308.09936 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"C-eval: A multi-level multi-discipline chinese evaluation suite for foundation models. arXiv:2305.08322","author":"Huang Yuzhen","year":"2023","unstructured":"Yuzhen Huang, Yuzhuo Bai, Zhihao Zhu, Junlei Zhang, Jinghan Zhang, Tangjun Su, Junteng Liu, Chuancheng Lv, Yikai Zhang, Jiayi Lei, et al. 2023. C-eval: A multi-level multi-discipline chinese evaluation suite for foundation models. arXiv:2305.08322 (2023)."},{"volume-title":"Icdar2019 competition on scanned receipt ocr and information extraction. In 2019 ICDAR","author":"Huang Zheng","key":"e_1_3_2_1_27_1","unstructured":"Zheng Huang, Kai Chen, Jianhua He, Xiang Bai, Dimosthenis Karatzas, Shijian Lu, and CV Jawahar. 2019. Icdar2019 competition on scanned receipt ocr and information extraction. In 2019 ICDAR. IEEE, 1516--1520."},{"key":"e_1_3_2_1_28_1","volume-title":"Gqa: A new dataset for real-world visual reasoning and compositional question answering. In CVPR. 6700--6709.","author":"Hudson Drew A","year":"2019","unstructured":"Drew A Hudson and Christopher D Manning. 2019. Gqa: A new dataset for real-world visual reasoning and compositional question answering. In CVPR. 6700--6709."},{"key":"e_1_3_2_1_29_1","volume-title":"Hazim Kemal Ekenel, and Jean-Philippe Thiran","author":"Jaume Guillaume","year":"2019","unstructured":"Guillaume Jaume, Hazim Kemal Ekenel, and Jean-Philippe Thiran. 2019. Funsd: A dataset for form understanding in noisy scanned documents. In ICDARW, Vol. 2. IEEE, 1--6."},{"key":"e_1_3_2_1_30_1","volume-title":"Li Fei-Fei, C Lawrence Zitnick, and Ross Girshick.","author":"Johnson Justin","year":"2017","unstructured":"Justin Johnson, Bharath Hariharan, Laurens Van Der Maaten, Li Fei-Fei, C Lawrence Zitnick, and Ross Girshick. 2017. Clevr: A diagnostic dataset for compositional language and elementary visual reasoning. In CVPR. 2901--2910."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Kushal Kafle and Christopher Kanan. 2017. An Analysis of Visual Question Answering Algorithms. In ICCV.","DOI":"10.1109\/ICCV.2017.217"},{"key":"e_1_3_2_1_32_1","volume-title":"ICDAR 2015 competition on robust reading. In 2015 13th ICDAR. IEEE, 1156--1160","author":"Karatzas Dimosthenis","year":"2015","unstructured":"Dimosthenis Karatzas, Lluis Gomez-Bigorda, Anguelos Nicolaou, Suman Ghosh, Andrew Bagdanov, Masakazu Iwamura, Jiri Matas, Lukas Neumann, Vijay Ramaseshan Chandrasekhar, Shijian Lu, et al. 2015. ICDAR 2015 competition on robust reading. In 2015 13th ICDAR. IEEE, 1156--1160."},{"key":"e_1_3_2_1_33_1","volume-title":"Referitgame: Referring to objects in photographs of natural scenes. In EMNLP. 787--798.","author":"Kazemzadeh Sahar","year":"2014","unstructured":"Sahar Kazemzadeh, Vicente Ordonez, Mark Matten, and Tamara Berg. 2014. Referitgame: Referring to objects in photographs of natural scenes. In EMNLP. 787--798."},{"key":"e_1_3_2_1_34_1","volume-title":"Wikihow: A large scale text summarization dataset. arXiv:1810.09305","author":"Koupaee Mahnaz","year":"2018","unstructured":"Mahnaz Koupaee and William Yang Wang. 2018. Wikihow: A large scale text summarization dataset. arXiv:1810.09305 (2018)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_36_1","unstructured":"Alex Krizhevsky Geoffrey Hinton et al. 2009. Learning multiple layers of features from tiny images. (2009)."},{"volume-title":"Visual information extraction in the wild: practical dataset and end-to-end solution","author":"Kuang Jianfeng","key":"e_1_3_2_1_37_1","unstructured":"Jianfeng Kuang, Wei Hua, Dingkang Liang, Mingkun Yang, Deqiang Jiang, Bo Ren, and Xiang Bai. 2023. Visual information extraction in the wild: practical dataset and end-to-end solution. In ICDAR. Springer, 36--53."},{"key":"e_1_3_2_1_38_1","volume-title":"Romaric Besan\u00e7on, Jos\u00e9 G Moreno, and Jes\u00fas Lov\u00f3n Melgarejo.","author":"Lerner Paul","year":"2022","unstructured":"Paul Lerner, Olivier Ferret, Camille Guinaudeau, Herv\u00e9 Le Borgne, Romaric Besan\u00e7on, Jos\u00e9 G Moreno, and Jes\u00fas Lov\u00f3n Melgarejo. 2022. ViQuAE, a dataset for knowledge-based visual question answering about named entities. In 45th ACM SIGIR. 3108--3120."},{"key":"e_1_3_2_1_39_1","volume-title":"SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension. arXiv:2307.16125","author":"Li Bohao","year":"2023","unstructured":"Bohao Li, Rui Wang, Guangzhi Wang, Yuying Ge, Yixiao Ge, and Ying Shan. 2023. SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension. arXiv:2307.16125 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"Empowering Vision-Language Models to Follow Interleaved Vision-Language Instructions. arXiv:2308.04152","author":"Li Juncheng","year":"2023","unstructured":"Juncheng Li, Kaihang Pan, Zhiqi Ge, Minghe Gao, Hanwang Zhang, Wei Ji, Wenqiao Zhang, Tat-Seng Chua, Siliang Tang, and Yueting Zhuang. 2023. Empowering Vision-Language Models to Follow Interleaved Vision-Language Instructions. arXiv:2308.04152 (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"Lei Li Yuwei Yin Shicheng Li Liang Chen Peiyi Wang Shuhuai Ren Mukai Li Yazheng Yang Jingjing Xu Xu Sun et al. 2023. M3IT: A Large-Scale Dataset towards Multi-Modal Multilingual Instruction Tuning. arXiv:2306.04387 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023. Evaluating object hallucination in large vision-language models. arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Monkey: Image resolution and text label are important things for large multi-modal models. arXiv:2311.06607","author":"Li Zhang","year":"2023","unstructured":"Zhang Li, Biao Yang, Qiang Liu, Zhiyin Ma, Shuo Zhang, Jingxu Yang, Yabo Sun, Yuliang Liu, and Xiang Bai. 2023. Monkey: Image resolution and text label are important things for large multi-modal models. arXiv:2311.06607 (2023)."},{"volume-title":"Microsoft coco: Common objects in context","author":"Lin Tsung-Yi","key":"e_1_3_2_1_45_1","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In ECCV. Springer, 740--755."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00566"},{"key":"e_1_3_2_1_47_1","volume-title":"Hallusionbench: You see what you think? or you think what you see? an image-context reasoning benchmark challenging for gpt-4v (ision), llava-1.5, and other multi-modality models. arXiv:2310.14566","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Tianrui Guan, Zongxia Li, Lichang Chen, Yaser Yacoob, Dinesh Manocha, and Tianyi Zhou. 2023. Hallusionbench: You see what you think? or you think what you see? an image-context reasoning benchmark challenging for gpt-4v (ision), llava-1.5, and other multi-modality models. arXiv:2310.14566 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Improved baselines with visual instruction tuning. arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved baselines with visual instruction tuning. arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_49_1","volume-title":"Llava-next: Improved reasoning, ocr, and world knowledge.","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, and Yong Jae Lee. 2024. Llava-next: Improved reasoning, ocr, and world knowledge."},{"key":"e_1_3_2_1_50_1","volume-title":"Visual instruction tuning. arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Yuan Liu Haodong Duan Yuanhan Zhang Bo Li Songyang Zhang Wangbo Zhao Yike Yuan Jiaqi Wang Conghui He Ziwei Liu et al. 2023. MMBench: Is Your Multi-modal Model an All-around Player? arXiv:2307.06281 (2023).","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"e_1_3_2_1_52_1","unstructured":"Yuliang Liu Zhang Li Hongliang Li Wenwen Yu Mingxin Huang Dezhi Peng Mingyu Liu Mingrui Chen Chunyuan Li Lianwen Jin et al. 2023. On the hidden mystery of ocr in large multimodal models. arXiv:2305.07895 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Deepseek-vl: Towards real-world vision-language understanding. arXiv:2403.05525","author":"Lu Haoyu","year":"2024","unstructured":"Haoyu Lu, Wen Liu, Bo Zhang, Bingxuan Wang, Kai Dong, Bo Liu, Jingxiang Sun, Tongzheng Ren, Zhuoshu Li, Yaofeng Sun, et al. 2024. Deepseek-vl: Towards real-world vision-language understanding. arXiv:2403.05525 (2024)."},{"key":"e_1_3_2_1_54_1","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume":"35","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tanglin Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to explain: Multimodal reasoning via thought chains for science question answering. Advances in Neural Information Processing Systems 35 (2022), 2507--2521.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_55_1","volume-title":"Ok-vqa: A visual question answering benchmark requiring external knowledge. In CVPR. 3195--3204.","author":"Marino Kenneth","year":"2019","unstructured":"Kenneth Marino, Mohammad Rastegari, Ali Farhadi, and Roozbeh Mottaghi. 2019. Ok-vqa: A visual question answering benchmark requiring external knowledge. In CVPR. 3195--3204."},{"key":"e_1_3_2_1_56_1","volume-title":"Docvqa: A dataset for vqa on document images. In WACV. 2200--2209.","author":"Mathew Minesh","year":"2021","unstructured":"Minesh Mathew, Dimosthenis Karatzas, and CV Jawahar. 2021. Docvqa: A dataset for vqa on document images. In WACV. 2200--2209."},{"volume-title":"Top-down and bottomup cues for scene text recognition","author":"Mishra Anand","key":"e_1_3_2_1_57_1","unstructured":"Anand Mishra, Karteek Alahari, and CV Jawahar. 2012. Top-down and bottomup cues for scene text recognition. In CVPR. IEEE, 2687--2694."},{"key":"e_1_3_2_1_58_1","volume-title":"Ajeet Kumar Singh, and Anirban Chakraborty","author":"Mishra Anand","year":"2019","unstructured":"Anand Mishra, Shashank Shekhar, Ajeet Kumar Singh, and Anirban Chakraborty. 2019. Ocr-vqa: Visual question answering by reading text in images. In 2019 ICDAR. IEEE, 947--952."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Maria-Elena Nilsback and Andrew Zisserman. 2008. Automated Flower Classification over a Large Number of Classes. In ICVGIP.","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_60_1","unstructured":"OpenAI. 2023. ChatGPT (August 3 version). https:\/\/chat.openai.com\/chat"},{"volume-title":"Cats and dogs","author":"Parkhi Omkar M","key":"e_1_3_2_1_62_1","unstructured":"Omkar M Parkhi, Andrea Vedaldi, Andrew Zisserman, and CV Jawahar. 2012. Cats and dogs. In CVPR. IEEE, 3498--3505."},{"key":"e_1_3_2_1_63_1","volume-title":"Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_64_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748--8763."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2014.07.008"},{"key":"e_1_3_2_1_66_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million imagetext pairs. arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million imagetext pairs. arXiv:2111.02114 (2021)."},{"key":"e_1_3_2_1_67_1","volume-title":"A-okvqa: A benchmark for visual question answering using world knowledge. arXiv.","author":"Schwenk Dustin","year":"2022","unstructured":"Dustin Schwenk, Apoorv Khandelwal, Christopher Clark, Kenneth Marino, and Roozbeh Mottaghi. 2022. A-okvqa: A benchmark for visual question answering using world knowledge. arXiv. (2022)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"Piyush Sharma Nan Ding Sebastian Goodman and Radu Soricut. 2018. Conceptual captions: A cleaned hypernymed image alt-text dataset for automatic image captioning. In ACL. 2556--2565.","DOI":"10.18653\/v1\/P18-1238"},{"volume-title":"Textcaps: a dataset for image captioning with reading comprehension","author":"Sidorov Oleksii","key":"e_1_3_2_1_69_1","unstructured":"Oleksii Sidorov, Ronghang Hu, Marcus Rohrbach, and Amanpreet Singh. 2020. Textcaps: a dataset for image captioning with reading comprehension. In ECCV. Springer, 742--758."},{"key":"e_1_3_2_1_70_1","volume-title":"Meet Shah, Yu Jiang, Xinlei Chen, Dhruv Batra, Devi Parikh, and Marcus Rohrbach.","author":"Singh Amanpreet","year":"2019","unstructured":"Amanpreet Singh, Vivek Natarajan, Meet Shah, Yu Jiang, Xinlei Chen, Dhruv Batra, Devi Parikh, and Marcus Rohrbach. 2019. Towards vqa models that can read. In CVPR. 8317--8326."},{"key":"e_1_3_2_1_71_1","volume-title":"Textocr: Towards large-scale end-to-end reasoning for arbitrary-shaped scene text. In CVPR. 8802--8812.","author":"Singh Amanpreet","year":"2021","unstructured":"Amanpreet Singh, Guan Pang, Mandy Toh, Jing Huang, Wojciech Galuba, and Tal Hassner. 2021. Textocr: Towards large-scale end-to-end reasoning for arbitrary-shaped scene text. In CVPR. 8802--8812."},{"key":"e_1_3_2_1_72_1","volume-title":"Abubakar Abid, Adam Fisch, Adam R Brown, Adam Santoro, Aditya Gupta, Adri\u00e0 Garriga-Alonso, et al.","author":"Srivastava Aarohi","year":"2022","unstructured":"Aarohi Srivastava, Abhinav Rastogi, Abhishek Rao, Abu Awal Md Shoeb, Abubakar Abid, Adam Fisch, Adam R Brown, Adam Santoro, Aditya Gupta, Adri\u00e0 Garriga-Alonso, et al. 2022. Beyond the imitation game: Quantifying and extrapolating the capabilities of language models. arXiv:2206.04615 (2022)."},{"key":"e_1_3_2_1_73_1","volume-title":"Pandagpt: One model to instruction-follow them all. arXiv:2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. Pandagpt: One model to instruction-follow them all. arXiv:2305.16355 (2023)."},{"key":"e_1_3_2_1_74_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth et al. 2023. Gemini: a family of highly capable multimodal models. arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_75_1","volume-title":"Winoground: Probing vision and language models for visio-linguistic compositionality. In CVPR. 5238--5248.","author":"Thrush Tristan","year":"2022","unstructured":"Tristan Thrush, Ryan Jiang, Max Bartolo, Amanpreet Singh, Adina Williams, Douwe Kiela, and Candace Ross. 2022. Winoground: Probing vision and language models for visio-linguistic compositionality. In CVPR. 5238--5248."},{"key":"e_1_3_2_1_76_1","volume-title":"Llama: Open and efficient foundation language models. arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_77_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_78_1","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","volume":"34","author":"Tsimpoukelli Maria","year":"2021","unstructured":"Maria Tsimpoukelli, Jacob L Menick, Serkan Cabi, SM Eslami, Oriol Vinyals, and Felix Hill. 2021. Multimodal few-shot learning with frozen language models. NeurIPS 34 (2021), 200--212.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_79_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_80_1","volume-title":"Cider: Consensus-based image description evaluation. In CVPR. 4566--4575.","author":"Vedantam Ramakrishna","year":"2015","unstructured":"Ramakrishna Vedantam, C Lawrence Zitnick, and Devi Parikh. 2015. Cider: Consensus-based image description evaluation. In CVPR. 4566--4575."},{"key":"e_1_3_2_1_81_1","volume-title":"What Makes for Good Visual Tokenizers for Large Language Models? arXiv:2305.12223","author":"Wang Guangzhi","year":"2023","unstructured":"Guangzhi Wang, Yixiao Ge, Xiaohan Ding, Mohan Kankanhalli, and Ying Shan. 2023. What Makes for Good Visual Tokenizers for Large Language Models? arXiv:2305.12223 (2023)."},{"key":"e_1_3_2_1_82_1","volume-title":"An llm-free multi-dimensional benchmark for mllms hallucination evaluation. arXiv:2311.07397","author":"Wang Junyang","year":"2023","unstructured":"Junyang Wang, Yuhang Wang, Guohai Xu, Jing Zhang, Yukai Gu, Haitao Jia, Ming Yan, Ji Zhang, and Jitao Sang. 2023. An llm-free multi-dimensional benchmark for mllms hallucination evaluation. arXiv:2311.07397 (2023)."},{"key":"e_1_3_2_1_83_1","volume-title":"ImageNetVC: Zero-Shot Visual Commonsense Evaluation on 1000 ImageNet Categories. arXiv:2305.15028","author":"Xia Heming","year":"2023","unstructured":"Heming Xia, Qingxiu Dong, Lei Li, Jingjing Xu, Ziwei Qin, and Zhifang Sui. 2023. ImageNetVC: Zero-Shot Visual Commonsense Evaluation on 1000 ImageNet Categories. arXiv:2305.15028 (2023)."},{"key":"e_1_3_2_1_84_1","volume-title":"Visual entailment: A novel task for fine-grained image understanding. arXiv:1901.06706","author":"Xie Ning","year":"2019","unstructured":"Ning Xie, Farley Lai, Derek Doran, and Asim Kadav. 2019. Visual entailment: A novel task for fine-grained image understanding. arXiv:1901.06706 (2019)."},{"volume-title":"Toward understanding wordart: Corner-guided transformer for scene text recognition","author":"Xie Xudong","key":"e_1_3_2_1_85_1","unstructured":"Xudong Xie, Ling Fu, Zhifei Zhang, Zhaowen Wang, and Xiang Bai. 2022. Toward understanding wordart: Corner-guided transformer for scene text recognition. In ECCV. Springer, 303--321."},{"key":"e_1_3_2_1_86_1","volume-title":"Lvlm-ehub: A comprehensive evaluation benchmark for large vision-language models. arXiv:2306.09265","author":"Xu Peng","year":"2023","unstructured":"Peng Xu, Wenqi Shao, Kaipeng Zhang, Peng Gao, Shuo Liu, Meng Lei, Fanqing Meng, Siyuan Huang, Yu Qiao, and Ping Luo. 2023. Lvlm-ehub: A comprehensive evaluation benchmark for large vision-language models. arXiv:2306.09265 (2023)."},{"key":"e_1_3_2_1_87_1","volume-title":"Multiinstruct: Improving multimodal zero-shot learning via instruction tuning. arXiv:2212.10773","author":"Xu Zhiyang","year":"2022","unstructured":"Zhiyang Xu, Ying Shen, and Lifu Huang. 2022. Multiinstruct: Improving multimodal zero-shot learning via instruction tuning. arXiv:2212.10773 (2022)."},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"crossref","unstructured":"Barry Menglong Yao Aditya Shah Lichao Sun Jin-Hee Cho and Lifu Huang. 2023. End-to-end multimodal fact-checking and explanation generation: A challenging dataset and models. In ACM SIGIR. 2733--2743.","DOI":"10.1145\/3539618.3591879"},{"key":"e_1_3_2_1_89_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_90_1","volume-title":"LAMM: Language- Assisted Multi-Modal Instruction-Tuning Dataset, Framework, and Benchmark. arXiv:2306.06687","author":"Yin Zhenfei","year":"2023","unstructured":"Zhenfei Yin, Jiong Wang, Jianjian Cao, Zhelun Shi, Dingning Liu, Mukai Li, Lu Sheng, Lei Bai, Xiaoshui Huang, Zhiyong Wang, et al. 2023. LAMM: Language- Assisted Multi-Modal Instruction-Tuning Dataset, Framework, and Benchmark. arXiv:2306.06687 (2023)."},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"volume-title":"Modeling context in referring expressions","author":"Yu Licheng","key":"e_1_3_2_1_92_1","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C Berg, and Tamara L Berg. 2016. Modeling context in referring expressions. In ECCV. Springer, 69--85."},{"key":"e_1_3_2_1_93_1","unstructured":"Weihao Yu Zhengyuan Yang Linjie Li Jianfeng Wang Kevin Lin Zicheng Liu Xinchao Wang and Lijuan Wang. 2023. MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities. arXiv:2308.02490 [cs.AI]"},{"key":"e_1_3_2_1_94_1","volume-title":"Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. arXiv:2311.16502","author":"Yue Xiang","year":"2023","unstructured":"Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu Jiang, Weiming Ren, Yuxuan Sun, et al. 2023. Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. arXiv:2311.16502 (2023)."},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"crossref","unstructured":"Rowan Zellers Yonatan Bisk Ali Farhadi and Yejin Choi. 2019. From recognition to cognition: Visual commonsense reasoning. In CVPR. 6720--6731.","DOI":"10.1109\/CVPR.2019.00688"},{"key":"e_1_3_2_1_96_1","volume-title":"What Matters in Training a GPT4-Style Language Model with Multimodal Inputs? arXiv:2307.02469","author":"Zeng Yan","year":"2023","unstructured":"Yan Zeng, Hanbo Zhang, Jiani Zheng, Jiangnan Xia, Guoqiang Wei, Yang Wei, Yuchen Zhang, and Tao Kong. 2023. What Matters in Training a GPT4-Style Language Model with Multimodal Inputs? arXiv:2307.02469 (2023)."},{"key":"e_1_3_2_1_97_1","volume-title":"Vinvl: Revisiting visual representations in vision-language models. In CVPR. 5579--5588.","author":"Zhang Pengchuan","year":"2021","unstructured":"Pengchuan Zhang, Xiujun Li, Xiaowei Hu, Jianwei Yang, Lei Zhang, Lijuan Wang, Yejin Choi, and Jianfeng Gao. 2021. Vinvl: Revisiting visual representations in vision-language models. In CVPR. 5579--5588."},{"key":"e_1_3_2_1_98_1","volume-title":"On evaluating adversarial robustness of large vision-language models. arXiv:2305.16934","author":"Zhao Yunqing","year":"2023","unstructured":"Yunqing Zhao, Tianyu Pang, Chao Du, Xiao Yang, Chongxuan Li, Ngai-Man Cheung, and Min Lin. 2023. On evaluating adversarial robustness of large vision-language models. arXiv:2305.16934 (2023)."},{"key":"e_1_3_2_1_99_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_100_1","volume-title":"Jesse Dodge, Alex Fang, Youngjae Yu, Ludwig Schmidt, William Yang Wang, and Yejin Choi.","author":"Zhu Wanrong","year":"2024","unstructured":"Wanrong Zhu, Jack Hessel, Anas Awadalla, Samir Yitzhak Gadre, Jesse Dodge, Alex Fang, Youngjae Yu, Ludwig Schmidt, William Yang Wang, and Yejin Choi. 2024. Multimodal c4: An open, billion-scale corpus of images interleaved with text. NeurIPS 36 (2024)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681529","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681529","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681529"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":99,"alternative-id":["10.1145\/3664647.3681529","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681529","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}