{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T02:12:24Z","timestamp":1768615944907,"version":"3.49.0"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032144942","type":"print"},{"value":"9783032144959","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-14495-9_32","type":"book-chapter","created":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T15:22:45Z","timestamp":1768576965000},"page":"416-428","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Vision Embeddings and\u00a0Their Role in\u00a0Hallucination Vulnerabilities of\u00a0Multimodal Large Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-4048-5784","authenticated-orcid":false,"given":"Chashi Mahiul","family":"Islam","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0948-4234","authenticated-orcid":false,"given":"Samuel Jacob","family":"Chacko","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8307-9730","authenticated-orcid":false,"given":"Preston","family":"Horne","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9320-3872","authenticated-orcid":false,"given":"Xiuwen","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,17]]},"reference":[{"key":"32_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv e-prints, pp. arXiv\u20132303 (2023)"},{"key":"32_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"32_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"32_CR4","unstructured":"Awadalla, A., et\u00a0al.: Openflamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"32_CR5","unstructured":"Bai, Z., Wu, Q., Zhang, Y., Jiang, Y., Li, X., et\u00a0al.: Qwen-VL: a scalable and versatile vision-language model. arXiv preprint arXiv:2310.15261 (2023)"},{"key":"32_CR6","unstructured":"Betker, J., et\u00a0al.: Improving image generation with better captions. Computer Science (2023). https:\/\/cdn.openai.com\/papers\/dall-e-3.pdf, 2(3), 8"},{"issue":"1","key":"32_CR7","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1049\/cit2.12028","volume":"6","author":"A Chakraborty","year":"2021","unstructured":"Chakraborty, A., Alam, M., Dey, V., Chattopadhyay, A., Mukhopadhyay, D.: A survey on adversarial attacks and defences. CAAI Trans. Intell. Technol. 6(1), 25\u201345 (2021)","journal-title":"CAAI Trans. Intell. Technol."},{"key":"32_CR8","unstructured":"Chen, X., et al.: Janus-pro: unified multimodal understanding and generation with data and model scaling. arXiv preprint arXiv:2501.17811 (2025)"},{"key":"32_CR9","unstructured":"Cohen, J., Rosenfeld, E., Kolter, Z.: Certified adversarial robustness via randomized smoothing. In: International Conference on Machine Learning, pp. 1310\u20131320. PMLR (2019)"},{"key":"32_CR10","doi-asserted-by":"crossref","unstructured":"Cui, X., Aparcedo, A., Jang, Y.K., Lim, S.N.: On the robustness of large multimodal models against image adversarial attacks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24625\u201324634 (2024)","DOI":"10.1109\/CVPR52733.2024.02325"},{"key":"32_CR11","unstructured":"Cunningham, H., Ewart, A., Riggs, L., Huben, R., Sharkey, L.: Sparse autoencoders find highly interpretable features in language models. arXiv e-prints, pp. arXiv\u20132309 (2023)"},{"key":"32_CR12","unstructured":"Driess, D., et\u00a0al.: Palm-e: an embodied multimodal language model. In: Proceedings of the 40th International Conference on Machine Learning, pp. 8469\u20138488 (2023)"},{"key":"32_CR13","unstructured":"Dubey, A., et\u00a0al.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"key":"32_CR14","unstructured":"Elhage, N., et\u00a0al.: Toy models of superposition. arXiv preprint arXiv:2209.10652 (2022)"},{"key":"32_CR15","unstructured":"Goodfellow, I.J., Shlens, J., Szegedy, C.: Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572 (2014)"},{"key":"32_CR16","unstructured":"Guo, D., et\u00a0al.: Deepseek-r1: incentivizing reasoning capability in LLMs via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)"},{"key":"32_CR17","first-page":"24","volume":"6","author":"T Henighan","year":"2023","unstructured":"Henighan, T., et al.: Superposition, memorization, and double descent. Transformer Circuits Thread 6, 24 (2023)","journal-title":"Transformer Circuits Thread"},{"key":"32_CR18","unstructured":"Ibrahimi, S., Atigh, M.G., Noord, N.V., Mettes, P., Worring, M.: Intriguing properties of hyperbolic embeddings in vision-language models. Trans. Mach. Learn. Res. (2024)"},{"key":"32_CR19","unstructured":"Islam, C.M., Chacko, S.J., Nishino, M., Liu, X.: Mechanistic understandings of representation vulnerabilities and engineering robust vision transformers. arXiv preprint arXiv:2502.04679 (2025)"},{"key":"32_CR20","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"32_CR21","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural. Inf. Process. Syst. 36, 34892\u201334916 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"32_CR22","unstructured":"Lu, D., Pang, T., Du, C., Liu, Q., Yang, X., Lin, M.: Test-time backdoor attacks on multimodal large language models. arXiv preprint arXiv:2402.08577 (2024)"},{"key":"32_CR23","doi-asserted-by":"crossref","unstructured":"Ma, Y., et\u00a0al.: Janusflow: harmonizing autoregression and rectified flow for unified multimodal understanding and generation. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp. 7739\u20137751 (2025)","DOI":"10.1109\/CVPR52734.2025.00725"},{"key":"32_CR24","unstructured":"Madry, A., Makelov, A., Schmidt, L., Tsipras, D., Vladu, A.: Towards deep learning models resistant to adversarial attacks. In: International Conference on Learning Representations (2018)"},{"key":"32_CR25","doi-asserted-by":"crossref","unstructured":"Moosavi-Dezfooli, S.M., Fawzi, A., Fawzi, O., Frossard, P.: Universal adversarial perturbations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1765\u20131773 (2017)","DOI":"10.1109\/CVPR.2017.17"},{"key":"32_CR26","doi-asserted-by":"crossref","unstructured":"Papernot, N., McDaniel, P., Jha, S., Fredrikson, M., Celik, Z.B., Swami, A.: The limitations of deep learning in adversarial settings. In: 2016 IEEE European Symposium on Security and Privacy (EuroS &P), pp. 372\u2013387. IEEE (2016)","DOI":"10.1109\/EuroSP.2016.36"},{"issue":"5","key":"32_CR27","doi-asserted-by":"publisher","first-page":"909","DOI":"10.3390\/app9050909","volume":"9","author":"S Qiu","year":"2019","unstructured":"Qiu, S., Liu, Q., Zhou, S., Wu, C.: Review of artificial intelligence adversarial attack and defense technologies. Appl. Sci. 9(5), 909 (2019)","journal-title":"Appl. Sci."},{"key":"32_CR28","unstructured":"Salman, S., Shams, M.M.B., Liu, X.: Intriguing equivalence structures of the embedding space of vision transformers. arXiv preprint arXiv:2401.15568 (2024)"},{"issue":"17","key":"32_CR29","doi-asserted-by":"publisher","first-page":"7782","DOI":"10.3390\/app14177782","volume":"14","author":"S Shahriar","year":"2024","unstructured":"Shahriar, S., et al.: Putting GPT-4o to the sword: a comprehensive evaluation of language, vision, speech, and multimodal proficiency. Appl. Sci. 14(17), 7782 (2024)","journal-title":"Appl. Sci."},{"key":"32_CR30","unstructured":"Von\u00a0Oswald, J., et al.: Transformers learn in-context by gradient descent. In: International Conference on Machine Learning, pp. 35151\u201335174. PMLR (2023)"},{"key":"32_CR31","unstructured":"Wan, A., Wallace, E., Shen, S., Klein, D.: Poisoning language models during instruction tuning. In: International Conference on Machine Learning, pp. 35413\u201335425. PMLR (2023)"},{"issue":"5","key":"32_CR32","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pcbi.1005536","volume":"13","author":"V Weilnhammer","year":"2017","unstructured":"Weilnhammer, V., Stuke, H., Hesselmann, G., Sterzer, P., Schmack, K.: A predictive coding account of bistable perception-a model-based fMRI study. PLoS Comput. Biol. 13(5), e1005536 (2017)","journal-title":"PLoS Comput. Biol."},{"key":"32_CR33","doi-asserted-by":"crossref","unstructured":"Wu, C., et\u00a0al.: Janus: decoupling visual encoding for unified multimodal understanding and generation. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp. 12966\u201312977 (2025)","DOI":"10.1109\/CVPR52734.2025.01210"},{"key":"32_CR34","unstructured":"Wu, Z., et\u00a0al.: Deepseek-vl2: mixture-of-experts vision-language models for advanced multimodal understanding. arXiv preprint arXiv:2412.10302 (2024)"},{"key":"32_CR35","unstructured":"Zhao, B., Wu, B., He, M., Huang, T.: SVIT: scaling up visual instruction tuning. arXiv preprint arXiv:2307.04087 (2023)"},{"key":"32_CR36","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: The Twelfth International Conference on Learning Representations (2024)"}],"container-title":["Lecture Notes in Computer Science","Advances in Visual Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-14495-9_32","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T15:22:52Z","timestamp":1768576972000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-14495-9_32"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032144942","9783032144959"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-14495-9_32","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"17 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISVC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Symposium on Visual Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Las Vegas, NV","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 November 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"isvc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.isvc.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}