{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:06:56Z","timestamp":1765339616077,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476066, 62277002"],"award-info":[{"award-number":["62476066, 62277002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755357","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:15Z","timestamp":1761375255000},"page":"4213-4221","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DREAM: Integrating Hierarchical Multimodal Retrieval with Multi-page Multimodal Language Model for Documents VQA"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-9876-1454","authenticated-orcid":false,"given":"Jinxu","family":"Zhang","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7107-9795","authenticated-orcid":false,"given":"Qiyuan","family":"Fan","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0674-788X","authenticated-orcid":false,"given":"Yongqi","family":"Yu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3090-7431","authenticated-orcid":false,"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Harbin, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Anthropic. 2024. Introducing the next generation of claude. (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511","author":"Asai Akari","year":"2023","unstructured":"Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Longformer: The long-document transformer. 
arXiv preprint arXiv:2004.05150","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150 (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01477"},{"key":"e_1_3_2_1_7_1","volume-title":"LoRA-Contextualizing Adaptation of Large Multimodal Models for Long Document Understanding. arXiv preprint arXiv:2411.01106","author":"Chen Jian","year":"2024","unstructured":"Jian Chen, Ruiyi Zhang, Yufan Zhou, Tong Yu, Franck Dernoncourt, Jiuxiang Gu, Ryan A Rossi, Changyou Chen, and Tong Sun. 2024b. LoRA-Contextualizing Adaptation of Large Multimodal Models for Long Document Understanding. arXiv preprint arXiv:2411.01106 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Zhe Chen Weiyun Wang Hao Tian Shenglong Ye Zhangwei Gao Erfei Cui Wenwen Tong Kongzhi Hu Jiapeng Luo Zheng Ma et al. 2024a. How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites. arXiv preprint arXiv:2404.16821 (2024).","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_1_9_1","volume-title":"M3docrag: Multi-modal retrieval is what you need for multi-page multi-document understanding. arXiv preprint arXiv:2411.04952","author":"Cho Jaemin","year":"2024","unstructured":"Jaemin Cho, Debanjan Mahata, Ozan Irsoy, Yujie He, and Mohit Bansal. 2024. M3docrag: Multi-modal retrieval is what you need for multi-page multi-document understanding. arXiv preprint arXiv:2411.04952 (2024)."},{"key":"e_1_3_2_1_10_1","unstructured":"Xiaoyi Dong Pan Zhang Yuhang Zang Yuhang Cao Bin Wang Linke Ouyang Songyang Zhang Haodong Duan Wenwei Zhang Yining Li et al. 2024. Internlm-xcomposer2-4khd: A pioneering large vision-language model handling resolutions from 336 pixels to 4k hd. arXiv preprint arXiv:2404.06512 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Colpali: Efficient document retrieval with vision language models. arXiv preprint arXiv:2407.01449","author":"Faysse Manuel","year":"2024","unstructured":"Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, C\u00e9line Hudelot, and Pierre Colombo. 2024. Colpali: Efficient document retrieval with vision language models. arXiv preprint arXiv:2407.01449 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","unstructured":"Maarten Grootendorst. 2020. KeyBERT: Minimal keyword extraction with BERT. doi:10.5281\/zenodo.4461265","DOI":"10.5281\/zenodo.4461265"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73010-8_23"},{"key":"e_1_3_2_1_14_1","volume-title":"International conference on machine learning. PMLR, 3929-3938","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Mingwei Chang. 2020. Retrieval augmented language model pre-training. In International conference on machine learning. PMLR, 3929-3938."},{"key":"e_1_3_2_1_15_1","unstructured":"Anwen Hu Haiyang Xu Jiabo Ye Ming Yan Liang Zhang Bo Zhang Chen Li Ji Zhang Qin Jin Fei Huang et al. 2024. mplug-docowl 1.5: Unified structure learning for ocr-free document understanding. arXiv preprint arXiv:2403.12895 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548112"},{"key":"e_1_3_2_1_17_1","volume-title":"Dense passage retrieval for open-domain question answering. 
arXiv preprint arXiv:2004.04906","author":"Karpukhin Vladimir","year":"2020","unstructured":"Vladimir Karpukhin, Barlas O\u011fuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 2020. Dense passage retrieval for open-domain question answering. arXiv preprint arXiv:2004.04906 (2020)."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of naacL-HLT","volume":"1","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of naacL-HLT, Vol. 1. 2."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"e_1_3_2_1_20_1","volume-title":"What matters when building vision-language models? arXiv preprint arXiv:2405.02246","author":"Lauren\u00e7on Hugo","year":"2024","unstructured":"Hugo Lauren\u00e7on, L\u00e9o Tronchon, Matthieu Cord, and Victor Sanh. 2024. What matters when building vision-language models? arXiv preprint arXiv:2405.02246 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia Raluca Turc, Hexiang Hu, Fangyu Liu, Julian Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. PMLR, 18893-18912."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"e_1_3_2_1_23_1","unstructured":"Haoyu Lu Wen Liu Bo Zhang Bingxuan Wang Kai Dong Bo Liu Jingxiang Sun Tongzheng Ren Zhuoshu Li Hao Yang et al. 2024. Deepseek-vl: towards real-world vision-language understanding. arXiv preprint arXiv:2403.05525 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Unifying multimodal retrieval via document screenshot embedding. arXiv preprint arXiv:2406.11251","author":"Ma Xueguang","year":"2024","unstructured":"Xueguang Ma, Sheng-Chieh Lin, Minghan Li, Wenhu Chen, and Jimmy Lin. 2024a. Unifying multimodal retrieval via document screenshot embedding. arXiv preprint arXiv:2406.11251 (2024)."},{"key":"e_1_3_2_1_25_1","unstructured":"Yubo Ma Yuhang Zang Liangyu Chen Meiqi Chen Yizhu Jiao Xinze Li Xinyuan Lu Ziyu Liu Yan Ma Xiaoyi Dong Pan Zhang Liangming Pan Yu-Gang Jiang Jiaqi Wang Yixin Cao and Aixin Sun. 2024b. MMLongBench-Doc: Benchmarking Long-context Document Understanding with Visualizations. arXiv:2407.01523 [cs.CV] https:\/\/arxiv.org\/abs\/2407.01523"},{"key":"e_1_3_2_1_26_1","volume-title":"Jia Qing Tan, Shafiq Joty, and Enamul Hoque.","author":"Masry Ahmed","year":"2022","unstructured":"Ahmed Masry, Do Xuan Long, Jia Qing Tan, Shafiq Joty, and Enamul Hoque. 2022. Chartqa: A benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"Infographicvqa. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 1697-1706","author":"Mathew Minesh","year":"2022","unstructured":"Minesh Mathew, Viraj Bagal, Rub\u00e8n Tito, Dimosthenis Karatzas, Ernest Valveny, and CV Jawahar. 2022. Infographicvqa. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 
1697-1706."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"e_1_3_2_1_29_1","unstructured":"OpenAI. 2024. Hello gpt-4o. (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 3rd Text REtrieval Conference (-3). 109-126","author":"Robertson S","year":"1994","unstructured":"S Robertson, Steve Walker, Susan Jones, and MHB GATFORD. 1994. Okapi at 3. In Proceedings of the 3rd Text REtrieval Conference (-3). 109-126."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1108\/eb026526"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01845"},{"key":"e_1_3_2_1_35_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al., 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109834"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01789"},{"key":"e_1_3_2_1_38_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079","author":"Wang Weihan","year":"2023","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, et al., 2023. Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403172"},{"key":"e_1_3_2_1_41_1","unstructured":"Yang Xu Yiheng Xu Tengchao Lv Lei Cui Furu Wei Guoxin Wang Yijuan Lu Dinei Florencio Cha Zhang Wanxiang Che et al. 2020b. Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)."},{"key":"e_1_3_2_1_42_1","volume-title":"Ureader: Universal ocr-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126","author":"Ye Jiabo","year":"2023","unstructured":"Jiabo Ye, Anwen Hu, Haiyang Xu, Qinghao Ye, Ming Yan, Guohai Xu, Chenliang Li, Junfeng Tian, Qi Qian, Ji Zhang, et al., 2023. Ureader: Universal ocr-free visually-situated language understanding with multimodal large language model. arXiv preprint arXiv:2310.05126 (2023)."},{"key":"e_1_3_2_1_43_1","unstructured":"Shi Yu Chaoyue Tang Bokai Xu Junbo Cui Junhao Ran Yukun Yan Zhenghao Liu Shuo Wang Xu Han Zhiyuan Liu et al. 2024. VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality Documents. 
arXiv preprint arXiv:2410.10594 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al.","author":"Zaheer Manzil","year":"2020","unstructured":"Manzil Zaheer, Guru Guruganesh, Kumar Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al., 2020. Big bird: Transformers for longer sequences. Advances in neural information processing systems, Vol. 33 (2020), 17283-17297."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680750"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755357","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:02:59Z","timestamp":1765339379000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755357"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3755357","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755357","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
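The record above has the standard shape of a Crossref REST API "work" response: the envelope fields ("status", "message-type") wrap the actual metadata under "message". A minimal Python sketch of reading it, assuming network access to the public api.crossref.org endpoint (the DOI is taken from the record's own "DOI" field; parsing a saved copy of the JSON with json.load works the same way):

```python
import json
import urllib.request

# Public Crossref REST API endpoint for a single work record.
URL = "https://api.crossref.org/works/10.1145/3746027.3755357"

with urllib.request.urlopen(URL) as resp:
    record = json.load(resp)

# The work metadata sits under the "message" key of the envelope.
msg = record["message"]
print(msg["title"][0])                       # paper title
print("; ".join(f'{a["given"]} {a["family"]}'
                for a in msg["author"]))     # author list
print(msg["DOI"], msg.get("page"))           # DOI and page range
print(msg["references-count"], "references") # size of the "reference" array
```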