{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:26:58Z","timestamp":1763922418469,"version":"3.45.0"},"publisher-location":"Cham","reference-count":57,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032093677","type":"print"},{"value":"9783032093684","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-09368-4_16","type":"book-chapter","created":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:14:20Z","timestamp":1763921660000},"page":"260-278","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Interpret, Prune and\u00a0Distill Donut: Towards Lightweight VLMs for\u00a0VQA on\u00a0Documents"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1891-4903","authenticated-orcid":false,"given":"Adnan","family":"Ben Mansour","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9304-4613","authenticated-orcid":false,"given":"Ayoub","family":"Karine","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8651-6555","authenticated-orcid":false,"given":"David","family":"Naccache","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,24]]},"reference":[{"key":"16_CR1","unstructured":"Appalaraju, S., et al.: DocformerV2: local features for document understanding (2023). https:\/\/arxiv.org\/abs\/2306.01733"},{"key":"16_CR2","unstructured":"Back, J., Ahn, N., Kim, J.: Magnitude attention-based dynamic pruning (2023). https:\/\/arxiv.org\/abs\/2306.05056"},{"key":"16_CR3","unstructured":"Bai, N., Iyer, R.A., Oikarinen, T., Kulkarni, A., Weng, T.W.: Interpreting neurons in deep vision networks with language models (2025). https:\/\/arxiv.org\/abs\/2403.13771"},{"key":"16_CR4","unstructured":"Bai, S., et al.: Qwen2.5-VL technical report (2025). https:\/\/arxiv.org\/abs\/2502.13923"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"Basu, S., Grayson, M., Morrison, C., Nushi, B., Feizi, S., Massiceti, D.: Understanding information storage and transfer in multi-modal large language models (2024). https:\/\/arxiv.org\/abs\/2406.04236","DOI":"10.52202\/079017-0237"},{"key":"16_CR6","unstructured":"Basu, S., Zhao, N., Morariu, V., Feizi, S., Manjunatha, V.: Localizing and editing knowledge in text-to-image generative models (2023). https:\/\/arxiv.org\/abs\/2310.13730"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Bau, D., Zhou, B., Khosla, A., Oliva, A., Torralba, A.: Network dissection: quantifying interpretability of deep visual representations (2017). https:\/\/arxiv.org\/abs\/1704.05796","DOI":"10.1109\/CVPR.2017.354"},{"key":"16_CR8","unstructured":"Bereska, L., Gavves, E.: Mechanistic interpretability for AI safety \u2013 a review (2024). 
https:\/\/arxiv.org\/abs\/2404.14082"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Biten, A.F., et al.: Scene text visual question answering (2019). https:\/\/arxiv.org\/abs\/1905.13648","DOI":"10.1109\/ICCV.2019.00439"},{"key":"16_CR10","unstructured":"Burns, C., Ye, H., Klein, D., Steinhardt, J.: Discovering latent knowledge in language models without supervision (2022). https:\/\/arxiv.org\/abs\/2212.03827"},{"key":"16_CR11","unstructured":"Conmy, A., Mavor-Parker, A.N., Lynch, A., Heimersheim, S., Garriga-Alonso, A.: Towards automated circuit discovery for mechanistic interpretability (2023). https:\/\/arxiv.org\/abs\/2304.14997"},{"key":"16_CR12","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale (2021). https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"16_CR13","unstructured":"Frankle, J., Carbin, M.: The lottery ticket hypothesis: finding sparse, trainable neural networks (2019). https:\/\/arxiv.org\/abs\/1803.03635"},{"key":"16_CR14","unstructured":"Gandelsman, Y., Efros, A.A., Steinhardt, J.: Interpreting clip\u2019s image representation via text-based decomposition (2024). https:\/\/arxiv.org\/abs\/2310.05916"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Golovanevsky, M., Rudman, W., Palit, V., Singh, R., Eickhoff, C.: What do VLMS notice? A mechanistic interpretability pipeline for gaussian-noise-free text-image corruption and evaluation (2025). https:\/\/arxiv.org\/abs\/2406.16320","DOI":"10.18653\/v1\/2025.naacl-long.571"},{"key":"16_CR16","unstructured":"Gurnee, W., Nanda, N., Pauly, M., Harvey, K., Troitskii, D., Bertsimas, D.: Finding neurons in a haystack: case studies with sparse probing (2023). https:\/\/arxiv.org\/abs\/2305.01610"},{"key":"16_CR17","unstructured":"Han, S., Pool, J., Tran, J., Dally, W.: Learning both weights and connections for efficient neural network. In: Cortes, C., Lawrence, N., Lee, D., Sugiyama, M., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol.\u00a028. Curran Associates, Inc. (2015). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2015\/file\/ae0eb3eed39d2bcef4622b2499a05fe6-Paper.pdf"},{"key":"16_CR18","unstructured":"Hanna, M., Liu, O., Variengien, A.: How does GPT-2 compute greater-than?: Interpreting mathematical abilities in a pre-trained language model (2023). https:\/\/arxiv.org\/abs\/2305.00586"},{"key":"16_CR19","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: prompt-to-prompt image editing with cross attention control (2022). https:\/\/arxiv.org\/abs\/2208.01626"},{"key":"16_CR20","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network (2015). https:\/\/arxiv.org\/abs\/1503.02531"},{"key":"16_CR21","unstructured":"Hu, H., Zhao, P., Li, P., Zheng, Y., Wang, Z., Yuan, X.: FASP: fast and accurate structured pruning of large language models (2025). https:\/\/arxiv.org\/abs\/2501.09412"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Kim, G., et al.: OCR-free document understanding transformer (2022). https:\/\/arxiv.org\/abs\/2111.15664","DOI":"10.1007\/978-3-031-19815-1_29"},{"key":"16_CR23","unstructured":"Lee, K., et al.: Pix2Struct: screenshot parsing as pretraining for visual language understanding (2023). https:\/\/arxiv.org\/abs\/2210.03347"},{"key":"16_CR24","unstructured":"Li, C., et al.: PP-OCRV3: more attempts for the improvement of ultra lightweight OCR system (2022). 
https:\/\/arxiv.org\/abs\/2206.03001"},{"key":"16_CR25","unstructured":"Li, M., et al.: TROCR: transformer-based optical character recognition with pre-trained models (2022). https:\/\/arxiv.org\/abs\/2109.10282"},{"key":"16_CR26","unstructured":"Lin, Z., et al.: A survey on mechanistic interpretability for multi-modal foundation models (2025). https:\/\/arxiv.org\/abs\/2502.17516"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Ling, G., Wang, Z., Yan, Y., Liu, Q.: SlimGPT: layer-wise structured pruning for large language models (2024). https:\/\/arxiv.org\/abs\/2412.18110","DOI":"10.52202\/079017-3401"},{"key":"16_CR28","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning (2023). https:\/\/arxiv.org\/abs\/2304.08485"},{"key":"16_CR29","unstructured":"Liu, Y., et al.: Multilingual denoising pre-training for neural machine translation (2020). https:\/\/arxiv.org\/abs\/2001.08210"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows (2021). https:\/\/arxiv.org\/abs\/2103.14030","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.V.: DocVQA: a dataset for VQA on document images (2020). https:\/\/arxiv.org\/abs\/2007.00398","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"16_CR32","unstructured":"Meng, K., Bau, D., Andonian, A., Belinkov, Y.: Locating and editing factual associations in GPT (2023). https:\/\/arxiv.org\/abs\/2202.05262"},{"key":"16_CR33","unstructured":"Muralidharan, S., et al.: Compact language models via pruning and knowledge distillation (2024). https:\/\/arxiv.org\/abs\/2407.14679"},{"key":"16_CR34","unstructured":"Nanda, N.: Attribution patching: activation patching at industrial scale (2023). https:\/\/www.neelnanda.io\/mechanistic-interpretability\/attribution-patching. Accessed 27 Apr 2025"},{"key":"16_CR35","unstructured":"nostalgebraist: Interpreting GPT: the logit lens (2020). https:\/\/www.lesswrong.com\/posts\/AcKRB8wDpdaN6v6ru\/interpreting-gpt-the-logit-lens. Accessed 7 Apr 2025"},{"key":"16_CR36","unstructured":"Olsson, C., et al.: In-context learning and induction heads (2022). https:\/\/arxiv.org\/abs\/2209.11895"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Palit, V., Pandey, R., Arora, A., Liang, P.P.: Towards vision-language mechanistic interpretability: a causal tracing tool for blip (2023). https:\/\/arxiv.org\/abs\/2308.14179","DOI":"10.1109\/ICCVW60793.2023.00307"},{"key":"16_CR38","doi-asserted-by":"crossref","unstructured":"Rai, D., Yao, Z.: An investigation of neuron activation as a unified lens to explain chain-of-thought eliciting arithmetic reasoning of LLMS (2024). https:\/\/arxiv.org\/abs\/2406.12288","DOI":"10.18653\/v1\/2024.acl-long.387"},{"key":"16_CR39","unstructured":"Rai, D., Zhou, Y., Feng, S., Saparov, A., Yao, Z.: A practical review of mechanistic interpretability for transformer-based language models (2025). https:\/\/arxiv.org\/abs\/2407.02646"},{"key":"16_CR40","unstructured":"Romero, A., Ballas, N., Kahou, S.E., Chassang, A., Gatta, C., Bengio, Y.: FitNets: hints for thin deep nets (2015). https:\/\/arxiv.org\/abs\/1412.6550"},{"key":"16_CR41","unstructured":"Sakarvadia, M., et al.: Attention lens: a tool for mechanistically interpreting the attention head information retrieval mechanism (2023). 
https:\/\/arxiv.org\/abs\/2310.16270"},{"key":"16_CR42","unstructured":"Shaikh, A., Cochez, M., Diachkov, D., de\u00a0Rijcke, M., Yousefi, S.: Donut-hole: Donut sparsification by harnessing knowledge and optimizing learning efficiency (2023). https:\/\/arxiv.org\/abs\/2311.05778"},{"key":"16_CR43","unstructured":"Sun, M., Liu, Z., Bair, A., Kolter, J.Z.: A simple and effective pruning approach for large language models (2024). https:\/\/arxiv.org\/abs\/2306.11695"},{"key":"16_CR44","doi-asserted-by":"crossref","unstructured":"Syed, A., Rager, C., Conmy, A.: Attribution patching outperforms automated circuit discovery (2023). https:\/\/arxiv.org\/abs\/2310.10348","DOI":"10.18653\/v1\/2024.blackboxnlp-1.25"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Tung, F., Mori, G.: Similarity-preserving knowledge distillation (2019). https:\/\/arxiv.org\/abs\/1907.09682","DOI":"10.1109\/ICCV.2019.00145"},{"key":"16_CR46","unstructured":"Vaswani, A., et al.: Attention is all you need (2017). https:\/\/arxiv.org\/abs\/1706.03762"},{"key":"16_CR47","unstructured":"Vig, J., et al.: Investigating gender bias in language models using causal mediation analysis. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems, vol.\u00a033, pp. 12388\u201312401. Curran Associates, Inc. (2020). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/92650b2e92217715fe312e6fa7b90d82-Paper.pdf"},{"key":"16_CR48","unstructured":"Wang, K., Variengien, A., Conmy, A., Shlegeris, B., Steinhardt, J.: Interpretability in the wild: a circuit for indirect object identification in GPT-2 small (2022). https:\/\/arxiv.org\/abs\/2211.00593"},{"key":"16_CR49","unstructured":"Wang, Z., et al.: SmartTrim: adaptive tokens and attention pruning for efficient vision-language models (2023). https:\/\/arxiv.org\/abs\/2305.15033"},{"key":"16_CR50","doi-asserted-by":"publisher","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: LayoutLM: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM (2019). https:\/\/doi.org\/10.1145\/3394486.3403172","DOI":"10.1145\/3394486.3403172"},{"key":"16_CR51","unstructured":"Yang, D., Cao, B., Zhang, A., Gu, W., Hu, W., Chen, G.: Beyond intermediate states: Explaining visual redundancy through language (2025). https:\/\/arxiv.org\/abs\/2503.20540"},{"key":"16_CR52","doi-asserted-by":"crossref","unstructured":"Ye, X., Gan, Y., Ge, Y., Zhang, X.P., Tang, Y.: ATP-LLAVA: adaptive token pruning for large vision language models (2024). https:\/\/arxiv.org\/abs\/2412.00447","DOI":"10.1109\/CVPR52734.2025.02325"},{"key":"16_CR53","unstructured":"Yu, Z., Ananiadou, S.: Understanding multimodal LLMS: the mechanistic interpretability of LLAVA in visual question answering (2025). https:\/\/arxiv.org\/abs\/2411.10950"},{"key":"16_CR54","unstructured":"Zafrir, O., Larey, A., Boudoukh, G., Shen, H., Wasserblat, M.: Prune once for all: sparse pre-trained language models (2021). https:\/\/arxiv.org\/abs\/2111.05754"},{"key":"16_CR55","unstructured":"Zagoruyko, S., Komodakis, N.: Paying more attention to attention: improving the performance of convolutional neural networks via attention transfer (2017). https:\/\/arxiv.org\/abs\/1612.03928"},{"key":"16_CR56","unstructured":"Zhang, Y., et al.: FinerCut: finer-grained interpretable layer pruning for large language models (2024). 
https:\/\/arxiv.org\/abs\/2405.18218"},{"key":"16_CR57","unstructured":"Zhu, J., et al.: InternVL3: exploring advanced training and test-time recipes for open-source multimodal models (2025). https:\/\/arxiv.org\/abs\/2504.10479"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-09368-4_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:14:41Z","timestamp":1763921681000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-09368-4_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,24]]},"ISBN":["9783032093677","9783032093684"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-09368-4_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,24]]},"assertion":[{"value":"24 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
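
The record above is a Crossref REST API work object (message-type "work") for the chapter. As a minimal illustrative sketch, assuming network access to the public Crossref API at api.crossref.org (the endpoint shape and the field names used below appear in the record itself; the variable names and the citation formatting are hypothetical choices for demonstration only), the same record can be fetched and a short citation line assembled from it:

# Sketch: retrieve the Crossref work record shown above and read a few fields.
# Endpoint: https://api.crossref.org/works/{DOI} (public Crossref REST API).
# Field paths ("message", "title", "author", "container-title", "page", "DOI")
# all occur in the record above; everything else is illustrative.
import json
import urllib.request

DOI = "10.1007/978-3-032-09368-4_16"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)          # parses the JSON envelope {"status": "ok", ..., "message": {...}}

msg = record["message"]               # the work object itself
title = msg["title"][0]               # chapter title
authors = ", ".join(
    f'{a.get("given", "")} {a.get("family", "")}'.strip() for a in msg["author"]
)
container = msg["container-title"][-1]  # proceedings title (last entry is the volume title)
pages = msg.get("page", "n/a")

print(f"{authors}: {title}. In: {container}, pp. {pages}. DOI: {msg['DOI']}")

Run as-is, this would print a one-line reference built from the metadata fields, e.g. author list, chapter title, proceedings title, page range, and DOI; the formatting is one possible convention, not something prescribed by the record.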