{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T11:48:02Z","timestamp":1769773682083,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":30,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557189","type":"print"},{"value":"9789819557196","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5719-6_26","type":"book-chapter","created":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T20:34:31Z","timestamp":1769718871000},"page":"402-416","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Dual-Stream Adaptive Retrieval and\u00a0Hierarchical Agent Collaboration for\u00a0Document Visual QA"],"prefix":"10.1007","author":[{"given":"Qin","family":"Fang","sequence":"first","affiliation":[]},{"given":"Ruixue","family":"Gou","sequence":"additional","affiliation":[]},{"given":"Shiqi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Minqi","family":"Song","sequence":"additional","affiliation":[]},{"given":"Pan","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Xian","family":"Peng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,30]]},"reference":[{"issue":"11","key":"26_CR1","first-page":"2215","volume":"66","author":"L Bornmann","year":"2015","unstructured":"Bornmann, L., Mutz, R.: Growth rates of modern science: a bibliometric analysis based on the number of publications and cited references. J. Am. Soc. Inf. Sci. 66(11), 2215\u20132222 (2015)","journal-title":"J. Am. Soc. Inf. Sci."},{"key":"26_CR2","unstructured":"Han, S., et al.: Mdocagent: a multi-modal multi-agent framework for document understanding. arXiv preprint arXiv:2503.13964 (2025)"},{"key":"26_CR3","doi-asserted-by":"publisher","unstructured":"Tanaka, R., Nishida, K., Nishida, K., Hasegawa, T., Saito, I., Saito, K.: Slidevqa: a dataset for document visual question answering on multiple images. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 13636\u201313645 (2023). https:\/\/doi.org\/10.1609\/aaai.v37i11.26598","DOI":"10.1609\/aaai.v37i11.26598"},{"key":"26_CR4","doi-asserted-by":"crossref","unstructured":"Chia, Y.K., et al.: M-longdoc: a benchmark for multimodal super-long document understanding and a retrieval-aware tuning framework. arXiv preprint arXiv:2411.06176 (2024)","DOI":"10.18653\/v1\/2025.emnlp-main.469"},{"key":"26_CR5","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: Docvqa: a dataset for vqa on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 
2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"26_CR6","unstructured":"Cho, J., Mahata, D., Irsoy, O., He, Y., Bansal, M.: M3docrag: multi-modal retrieval is what you need for multi-page multi-document understanding. arXiv preprint arXiv:2411.04952 (2024)"},{"key":"26_CR7","doi-asserted-by":"publisher","first-page":"142642","DOI":"10.1109\/ACCESS.2020.3012542","volume":"8","author":"J Memon","year":"2020","unstructured":"Memon, J., Sami, M., Khan, R.A., Uddin, M.: Handwritten optical character recognition (ocr): A comprehensive systematic literature review (slr). IEEE Access 8, 142642\u2013142668 (2020)","journal-title":"IEEE Access"},{"key":"26_CR8","first-page":"9459","volume":"33","author":"P Lewis","year":"2020","unstructured":"Lewis, P., et al.: Retrieval-augmented generation for knowledge-intensive nlp tasks. Adv. Neural. Inf. Process. Syst. 33, 9459\u20139474 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR9","unstructured":"Yu, S., et\u00a0al.: Visrag: vision-based retrieval-augmented generation on multi-modality documents. arXiv preprint arXiv:2410.10594 (2024)"},{"key":"26_CR10","doi-asserted-by":"crossref","unstructured":"Wang, Q., et al.: Vidorag: visual document retrieval-augmented generation via dynamic iterative reasoning agents. arXiv preprint arXiv:2502.18017 (2025)","DOI":"10.18653\/v1\/2025.emnlp-main.464"},{"key":"26_CR11","doi-asserted-by":"publisher","first-page":"109834","DOI":"10.1016\/j.patcog.2023.109834","volume":"144","author":"R Tito","year":"2023","unstructured":"Tito, R., Karatzas, D., Valveny, E.: Hierarchical multimodal transformers for multipage docvqa. Pattern Recogn. 144, 109834 (2023). https:\/\/doi.org\/10.1016\/j.patcog.2023.109834","journal-title":"Pattern Recogn."},{"key":"26_CR12","doi-asserted-by":"publisher","unstructured":"Mishra, A., Shekhar, S., Singh, A.K., Chakraborty, A.: Ocr-vqa: visual question answering by reading text in images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 947\u2013952. IEEE (2019). https:\/\/doi.org\/10.1109\/ICDAR.2019.00155","DOI":"10.1109\/ICDAR.2019.00155"},{"key":"26_CR13","doi-asserted-by":"crossref","unstructured":"Ding, Y., et al.: V-doc: visual questions answers with documents. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 21492\u201321498 (2022)","DOI":"10.1109\/CVPR52688.2022.02083"},{"key":"26_CR14","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Motiongpt: finetuned llms are general-purpose motion generators. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 7368\u20137376 (2024)","DOI":"10.1609\/aaai.v38i7.28567"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"26_CR16","unstructured":"Tong, H., et\u00a0al.: Mj-video: fine-grained benchmarking and rewarding video preferences in video generation. arXiv preprint arXiv:2502.01719 (2025)"},{"key":"26_CR17","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: Beit pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
19175\u201319186 (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"26_CR18","unstructured":"Faysse, M., et al.: Colpali: efficient document retrieval with vision language models (2025). https:\/\/arxiv.org\/abs\/2407.01449"},{"key":"26_CR19","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural. Inf. Process. Syst. 36, 34892\u201334916 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR20","unstructured":"Bai, J., et\u00a0al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"26_CR21","unstructured":"Achiam, J., et\u00a0al.: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"26_CR22","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"26_CR23","doi-asserted-by":"crossref","unstructured":"Abootorabi, M.M., et al.: Ask in any modality: a comprehensive survey on multimodal retrieval-augmented generation. arXiv preprint arXiv:2502.08826 (2025)","DOI":"10.18653\/v1\/2025.findings-acl.861"},{"key":"26_CR24","unstructured":"Du, Y., et\u00a0al.: Pp-ocr: a practical ultra lightweight ocr system. arXiv preprint arXiv:2009.09941 (2020)"},{"key":"26_CR25","doi-asserted-by":"crossref","unstructured":"Khattab, O., Zaharia, M.: Colbert: efficient and effective passage search via contextualized late interaction over bert. In: Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 39\u201348 (2020)","DOI":"10.1145\/3397271.3401075"},{"issue":"1","key":"26_CR26","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1146\/annurev-statistics-030718-104938","volume":"6","author":"VM Panaretos","year":"2019","unstructured":"Panaretos, V.M., Zemel, Y.: Statistical aspects of wasserstein distances. Ann. Rev. Stat. Appl. 6(1), 405\u2013431 (2019)","journal-title":"Ann. Rev. Stat. Appl."},{"key":"26_CR27","unstructured":"Weng, L.: From gan to wgan. arXiv preprint arXiv:1904.08994 (2019)"},{"key":"26_CR28","unstructured":"Lee, C., et al.: Nv-embed: improved techniques for training llms as generalist embedding models. arXiv preprint arXiv:2405.17428 (2024)"},{"key":"26_CR29","unstructured":"Bai, S., et\u00a0al.: Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"26_CR30","doi-asserted-by":"crossref","unstructured":"Chen, J., Xiao, S., Zhang, P., Luo, K., Lian, D., Liu, Z.: Bge m3-embedding: multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. 
arXiv preprint arXiv:2402.03216 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.137"}],"container-title":["Lecture Notes in Computer Science","Web and Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5719-6_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T20:34:37Z","timestamp":1769718877000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5719-6_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557189","9789819557196"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5719-6_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"30 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"APWeb-WAIM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shenyang","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"apwebwaim2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/apweb2025.sau.edu.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
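
The record above is the JSON body returned by the Crossref REST API for this chapter's DOI (a "work" message wrapping the bibliographic metadata and its 30 references). The following is a minimal Python sketch, assuming network access to api.crossref.org and its documented works endpoint, of how such a record can be fetched and a few fields (title, authors, container title, reference count) read out; the User-Agent contact address and the printed field selection are illustrative assumptions, not part of the record.

import json
import urllib.request

DOI = "10.1007/978-981-95-5719-6_26"
url = f"https://api.crossref.org/works/{DOI}"

# Crossref asks clients to identify themselves; the mailto address here is a
# placeholder, not something taken from the record above.
req = urllib.request.Request(
    url,
    headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"},
)

with urllib.request.urlopen(req, timeout=30) as resp:
    payload = json.load(resp)

# The response wraps the actual work metadata in a "message" envelope,
# matching the structure of the record shown above.
assert payload.get("status") == "ok" and payload.get("message-type") == "work"
work = payload["message"]

title = work["title"][0]
authors = ", ".join(
    f'{a.get("given", "")} {a.get("family", "")}'.strip()
    for a in work.get("author", [])
)
container = " / ".join(work.get("container-title", []))

print(f"Title     : {title}")
print(f"Authors   : {authors}")
print(f"Published : {work['published']['date-parts'][0][0]} ({container})")
print(f"DOI       : {work['DOI']}")
print(f"References: {work.get('references-count', 0)}")

The same "message" object is what appears verbatim above, so the field-access pattern in the sketch applies equally to a saved copy of this record loaded with json.loads, without any network call.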