{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:29:06Z","timestamp":1777865346786,"version":"3.51.4"},"reference-count":73,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00213","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"2206-2216","source":"Crossref","is-referenced-by-count":0,"title":["Analyzing Fine-Tuning Representation Shift for Multimodal LLMs Steering"],"prefix":"10.1109","author":[{"given":"Pegah","family":"Khayatan","sequence":"first","affiliation":[{"name":"Sorbonne Universit&#x00E9;,ISIR,Paris,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mustafa","family":"Shukor","sequence":"additional","affiliation":[{"name":"Sorbonne Universit&#x00E9;,ISIR,Paris,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jayneel","family":"Parekh","sequence":"additional","affiliation":[{"name":"Sorbonne Universit&#x00E9;,ISIR,Paris,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Arnaud","family":"Dapogny","sequence":"additional","affiliation":[{"name":"Sorbonne Universit&#x00E9;,ISIR,Paris,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Matthieu","family":"Cord","sequence":"additional","affiliation":[{"name":"Sorbonne Universit&#x00E9;,ISIR,Paris,France"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4322"},{"key":"ref3","article-title":"Qwen-vl: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023","journal-title":"arXiv preprint"},{"key":"ref4","first-page":"1539","volume-title":"What makes multimodal in-context learning work? In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops","author":"Baldassini","year":"2024"},{"key":"ref5","author":"Belrose","year":"2023","journal-title":"Eliciting latent predictions from transformers with the tuned lens. ArXiv, abs\/2303.08112"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2884"},{"key":"ref7","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01514"},{"key":"ref9","article-title":"Understanding and improving in-context learning on vision-language models","author":"Chen","year":"2024","journal-title":"ICLR 2024 Workshop on Mathematical and Empirical Understanding of Foundation Models"},{"key":"ref10","article-title":"Palm: Scaling language modeling with pathways","author":"Chowdhery","year":"2022","journal-title":"arXiv preprint"},{"key":"ref11","article-title":"Eta: Evaluating then aligning safety of vision language models at inference time","volume":"abs\/2410.06625","author":"Ding","year":"2024","journal-title":"ArXiv"},{"key":"ref12","article-title":"Palm-e: An embodied multimodal language model","author":"Driess","year":"2023","journal-title":"arXiv preprint"},{"key":"ref13","article-title":"A mathematical framework for transformer circuits","author":"Elhage","year":"2021","journal-title":"Transformer Circuits Thread"},{"key":"ref14","article-title":"Not all language model features are linear","author":"Engels","year":"2024","journal-title":"arXiv preprint"},{"key":"ref15","article-title":"Multimodal task vectors enable manyshot multimodal in-context learning","author":"Huang","year":"2024","journal-title":"NeurIPS"},{"key":"ref16","article-title":"LIVE: Learnable in-context vector for visual question answering","author":"Yingzhe","year":"2024","journal-title":"NeurIPS"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2391"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00266"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00901"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.75"},{"key":"ref21","first-page":"9277","article-title":"Towards automatic concept-based explanations","author":"Ghorbani","year":"2019","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i22.34568"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref25","article-title":"Lora: Low-rank adaptation of large language models","volume":"abs\/2106.09685","author":"Hu","year":"2021","journal-title":"ArXiv"},{"key":"ref26","article-title":"Miner: Mining the underlying pattern of modality-specific neurons in multimodal large language models","author":"Huang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref27","article-title":"Sparse autoencoders find highly interpretable features in language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Huben","year":"2024"},{"key":"ref28","article-title":"Mistral 7b","author":"Albert","year":"2023","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02776"},{"key":"ref30","first-page":"2668","article-title":"Interpretability beyond feature attribution: Quantitative testing with concept activation vectors (tcav)","volume-title":"International conference on machine learning","author":"Kim","year":"2018"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref32","article-title":"Decoderlens: Layerwise interpretation of encoder-decoder transformers","volume":"abs\/2310.03686","author":"Langedijk","year":"2023","journal-title":"ArXiv"},{"key":"ref33","article-title":"Obelics: An open web-scale filtered dataset of interleaved image-text documents","author":"Lauren\u00e7on","year":"2024","journal-title":"Advances in Neural Information Processing Systems, 36"},{"key":"ref34","article-title":"What matters when building vision-language models?","author":"Lauren\u00e7on","year":"2024","journal-title":"arXiv preprint"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1797"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.235"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72992-8_22"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.eacl-main.185"},{"key":"ref42","author":"Nanda","year":"2023","journal-title":"Actually, othello-gpt has a linear emergent world model"},{"key":"ref43","article-title":"Interpreting gpt: The logit lens","volume-title":"Nostalgebraist","year":"2020"},{"key":"ref44","article-title":"Gpt-4 technical report","year":"2023","journal-title":"arXiv"},{"key":"ref45","article-title":"Finding and editing multi-modal neurons in pre-trained transformer","author":"Pan","year":"2023","journal-title":"arXiv preprint"},{"key":"ref46","article-title":"Steering llama 2 via contrastive activation addition","author":"Panickssery","year":"2023","journal-title":"arXiv preprint"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4312"},{"key":"ref48","article-title":"The linear representation hypothesis and the geometry of large language models","volume-title":"Forty-first International Conference on Machine Learning","author":"Park","year":"2024"},{"key":"ref49","article-title":"What factors affect multi-modal incontext learning? an in-depth exploration","author":"Qin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref50","article-title":"Ui-tars: Pioneering automated gui interaction with native agents","author":"Qin","year":"2025","journal-title":"arXiv preprint"},{"key":"ref51","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"key":"ref52","article-title":"Improving dictionary learning with gated sparse autoencoders","author":"Rajamanoharan","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref53","first-page":"1840018421","article-title":"Linear adversarial concept erasure","volume-title":"International Conference on Machine Learning","author":"Ravfogel","year":"2022"},{"key":"ref54","article-title":"Attention lens: A tool for mechanistically interpreting the attention head information retrieval mechanism","volume":"abs\/2310.16270","author":"Sakarvadia","year":"2023","journal-title":"CoRR"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00308"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4159"},{"key":"ref57","article-title":"Skipping computations in multimodal 11 ms","author":"Shukor","year":"2024","journal-title":"arXiv preprint"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.02016"},{"key":"ref59","article-title":"Beyond task performance: evaluating and reducing the flaws of large multimodal models with in-contextlearning","volume-title":"The Twelfth International Conference on Learning Representations","author":"Shukor","year":"2024"},{"key":"ref60","article-title":"Smolvla: A vision-language-action model for affordable and efficient robotics","author":"Shukor","year":"2025","journal-title":"arXiv preprint"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00009"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.48"},{"key":"ref63","article-title":"Gemma 2: Improving open language models at a practical size","author":"Team","year":"2024","journal-title":"arXiv preprint"},{"key":"ref64","article-title":"Linear representations of sentiment in large language models","author":"Tigges","year":"2023","journal-title":"arXiv preprint"},{"key":"ref65","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref66","article-title":"Activation addition: Steering language models without optimization","author":"Matt Turner","year":"2023","journal-title":"arXiv preprint"},{"key":"ref67","article-title":"Improved baselines for data-efficient perceptual augmentation of 11 ms","author":"Vallaeys","year":"2024","journal-title":"arXiv preprint"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72661-3_5"},{"key":"ref69","article-title":"Reft: Representation finetuning for language models","author":"Wu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref70","article-title":"Fewshot multimodal explanation for visual question answering","author":"Xue","year":"2024","journal-title":"ACM Multimedia 2024"},{"key":"ref71","article-title":"On conceptbased explanations in deep neural networks","author":"Yeh","journal-title":"arXiv preprint"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref73","article-title":"From redundancy to relevance: Enhancing explainability in multimodal large language models","author":"Zhang","year":"2024","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445319.pdf?arnumber=11445319","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:18:54Z","timestamp":1777529934000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445319\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":73,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00213","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}