{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T18:47:35Z","timestamp":1772822855339,"version":"3.50.1"},"reference-count":56,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,2,19]],"date-time":"2025-02-19T00:00:00Z","timestamp":1739923200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,19]],"date-time":"2025-02-19T00:00:00Z","timestamp":1739923200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s11263-025-02368-9","type":"journal-article","created":{"date-parts":[[2025,2,19]],"date-time":"2025-02-19T14:03:12Z","timestamp":1739973792000},"page":"3994-4013","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["VL-Trojan: Multimodal Instruction Backdoor Attacks against Autoregressive Visual Language Models"],"prefix":"10.1007","volume":"133","author":[{"given":"Jiawei","family":"Liang","sequence":"first","affiliation":[]},{"given":"Siyuan","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Aishan","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7141-708X","authenticated-orcid":false,"given":"Xiaochun","family":"Cao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,19]]},"reference":[{"key":"2368_CR1","unstructured":"Achiam, J., Adler, S., Agarwal. S., et\u00a0al. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2368_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J. B., Donahue, J., Luc, P., et al. (2022). Flamingo: A visual language model for few-shot learning. Advances in Neural Information Processing Systems, 35, 23716\u201323736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2368_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., et\u00a0al. (2015). Vqa: Visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433.","DOI":"10.1109\/ICCV.2015.279"},{"key":"2368_CR4","unstructured":"Awadalla, A., Gao, I., Gardner, J., et\u00a0al. (2023). Openflamingo: An open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390"},{"key":"2368_CR5","doi-asserted-by":"crossref","unstructured":"Bai, J., Gao, K., Min, S., et\u00a0al. (2024). Badclip: Trigger-aware prompt learning for backdoor attacks on clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24239\u201324250.","DOI":"10.1109\/CVPR52733.2024.02288"},{"key":"2368_CR6","doi-asserted-by":"crossref","unstructured":"Bansal, H., Singhi, N., Yang, Y., et\u00a0al. (2023). Cleanclip: Mitigating data poisoning attacks in multimodal contrastive learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 112\u2013123.","DOI":"10.1109\/ICCV51070.2023.00017"},{"key":"2368_CR7","doi-asserted-by":"crossref","unstructured":"Barni, M., Kallas, K., Tondi, B. (2019). A new backdoor attack in CNNS by training set corruption without label poisoning. In: 2019 IEEE International Conference on Image Processing (ICIP), IEEE, pp. 101\u2013105.","DOI":"10.1109\/ICIP.2019.8802997"},{"key":"2368_CR8","unstructured":"Brock, A., De S., Smith, S.L., et\u00a0al. (2021). High-performance large-scale image recognition without normalization. In: International Conference on Machine Learning, PMLR, pp. 1059\u20131071."},{"key":"2368_CR9","unstructured":"Carlini, N., Terzis, A. (2022). Poisoning and backdooring contrastive learning. In: International Conference on Learning Representations."},{"key":"2368_CR10","unstructured":"Chen, X., Fang, H., Lin, T.Y., et\u00a0al. (2015). Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325"},{"key":"2368_CR11","unstructured":"Chen, X., Liu, C., Li, B., et\u00a0al. (2017). Targeted backdoor attacks on deep learning systems using data poisoning. arXiv preprint arXiv:1712.05526"},{"issue":"70","key":"2368_CR12","first-page":"1","volume":"25","author":"HW Chung","year":"2024","unstructured":"Chung, H. W., Hou, L., Longpre, S., et al. (2024). Scaling instruction-finetuned language models. Journal of Machine Learning Research, 25(70), 1\u201353.","journal-title":"Journal of Machine Learning Research"},{"key":"2368_CR13","doi-asserted-by":"crossref","unstructured":"Duan, H., Yang, J., Qiao, Y., et\u00a0al. (2024). Vlmevalkit: An open-source toolkit for evaluating large multi-modality models. arXiv:2407.11691","DOI":"10.1145\/3664647.3685520"},{"key":"2368_CR14","unstructured":"Fu, C., Chen, P., Shen, Y., et\u00a0al. (2023). Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394"},{"key":"2368_CR15","doi-asserted-by":"publisher","first-page":"47230","DOI":"10.1109\/ACCESS.2019.2909068","volume":"7","author":"T Gu","year":"2019","unstructured":"Gu, T., Liu, K., Dolan-Gavitt, B., et al. (2019). Badnets: Evaluating backdooring attacks on deep neural networks. IEEE Access, 7, 47230\u201347244.","journal-title":"IEEE Access"},{"key":"2368_CR16","doi-asserted-by":"crossref","unstructured":"Gurari, D., Li, Q., Stangl, A.J., et\u00a0al. (2018). Vizwiz grand challenge: Answering visual questions from blind people. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3608\u20133617.","DOI":"10.1109\/CVPR.2018.00380"},{"key":"2368_CR17","doi-asserted-by":"crossref","unstructured":"Han, J., Gong, K., Zhang, Y., et\u00a0al. (2024). Onellm: One framework to align all modalities with language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26584\u201326595.","DOI":"10.1109\/CVPR52733.2024.02510"},{"key":"2368_CR18","unstructured":"Han, X., Wu, Y., Zhang, Q., et\u00a0al. (2023). Backdooring multimodal learning. In: 2024 IEEE Symposium on Security and Privacy (SP), IEEE Computer Society, pp. 31\u201331."},{"key":"2368_CR19","unstructured":"Hoffmann, J., Borgeaud, S., Mensch, A., et\u00a0al. (2022). Training compute-optimal large language models. In: Proceedings of the 36th International Conference on Neural Information Processing Systems, pp. 30016\u201330030."},{"issue":"6","key":"2368_CR20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain, M. Z., Sohel, F., Shiratuddin, M. F., et al. (2019). A comprehensive survey of deep learning for image captioning. ACM Computing Surveys (CsUR), 51(6), 1\u201336.","journal-title":"ACM Computing Surveys (CsUR)"},{"key":"2368_CR21","unstructured":"Li, B., Zhang, Y., Chen, L., et\u00a0al. (2023a). Mimic-it: Multi-modal in-context instruction tuning. arXiv preprint arXiv:2306.05425"},{"key":"2368_CR22","unstructured":"Li, B., Zhang, Y., Chen, L., et\u00a0al. (2023b). Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726"},{"key":"2368_CR23","doi-asserted-by":"crossref","unstructured":"Li, B., Ge, Y., Ge, Y., et\u00a0al. (2024). Seed-bench: Benchmarking multimodal large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13299\u201313308.","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"2368_CR24","unstructured":"Li, J., Li, D., Savarese, S., et\u00a0al. (2023c). Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, PMLR, pp. 19730\u201319742."},{"key":"2368_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., Li, Y., Wu, B., et\u00a0al. (2021a). Invisible backdoor attack with sample-specific triggers. In: Proceedings of the IEEE\/CVF international Conference on Computer Vision, pp. 16463\u201316472.","DOI":"10.1109\/ICCV48922.2021.01615"},{"key":"2368_CR26","first-page":"14900","volume":"34","author":"Y Li","year":"2021","unstructured":"Li, Y., Lyu, X., Koren, N., et al. (2021). Anti-backdoor learning: Training clean models on poisoned data. Advances in Neural Information Processing Systems, 34, 14900\u201314912.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"1","key":"2368_CR27","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1109\/TNNLS.2022.3182979","volume":"35","author":"Y Li","year":"2022","unstructured":"Li, Y., Jiang, Y., Li, Z., et al. (2022). Backdoor learning: A survey. IEEE Transactions on Neural Networks and Learning Systems, 35(1), 5\u201322.","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"2368_CR28","unstructured":"Liang, J., Liang, S., Liu, A., et\u00a0al. (2024a). Poisoned forgery face: Towards backdoor attacks on face forgery detection. In: The Twelfth International Conference on Learning Representations"},{"key":"2368_CR29","unstructured":"Liang, S., Liang, J., Pang, T., et\u00a0al. (2024b). Revisiting backdoor attacks against large vision-language models. arXiv preprint arXiv:2406.18844"},{"key":"2368_CR30","doi-asserted-by":"crossref","unstructured":"Liang, S., Zhu, M., Liu, A., et\u00a0al. (2024c). Badclip: Dual-embedding guided backdoor attack on multimodal contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24645\u201324654.","DOI":"10.1109\/CVPR52733.2024.02327"},{"key":"2368_CR31","unstructured":"Liu, A., Zhang, X., Xiao, Y., et\u00a0al. (2023a). Pre-trained trojan attacks for visual recognition. arXiv preprint arXiv:2312.15172"},{"key":"2368_CR32","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Wu, Q., et\u00a0al. (2024). Visual instruction tuning. Advances in neural information processing systems 36","DOI":"10.1007\/978-981-99-8079-6_1"},{"key":"2368_CR33","doi-asserted-by":"crossref","unstructured":"Liu, Y., Duan, H., Zhang, Y., et\u00a0al. (2023b). Mmbench: Is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"2368_CR34","unstructured":"Loshchilov, I., Hutter, F. (2017). Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101"},{"key":"2368_CR35","unstructured":"Lu, D., Pang, T., Du, C., et\u00a0al. (2024). Test-time backdoor attacks on multimodal large language models. arXiv preprint arXiv:2402.08577"},{"key":"2368_CR36","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., et\u00a0al. (2019). Ok-vqa: A visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204.","DOI":"10.1109\/CVPR.2019.00331"},{"key":"2368_CR37","unstructured":"MosaicML (2023) Introducing mpt-7b: A new standard for open-source, commercially usable llms"},{"key":"2368_CR38","unstructured":"Radford, A., Kim, J.W., Hallacy, C., et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, PMLR, pp. 8748\u20138763."},{"key":"2368_CR39","doi-asserted-by":"crossref","unstructured":"Singh, A., Natarajan, V., Shah, M., et\u00a0al. (2019). Towards vqa models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326.","DOI":"10.1109\/CVPR.2019.00851"},{"key":"2368_CR40","unstructured":"Together.xyz (2023) Releasing 3b and 7b redpajama-incite family of models including base, instruction-tuned & chat models. https:\/\/www.together.xyz\/blog\/redpajama-models-v1"},{"key":"2368_CR41","unstructured":"Touvron, H., Lavril, T., Izacard, G., et\u00a0al. (2023). Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971"},{"key":"2368_CR42","doi-asserted-by":"crossref","unstructured":"Walmer, M., Sikka, K., Sur, I., et\u00a0al. (2022). Dual-key multimodal backdoors for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15375\u201315385.","DOI":"10.1109\/CVPR52688.2022.01494"},{"key":"2368_CR43","doi-asserted-by":"crossref","unstructured":"Wang, T., Yao, Y., Xu, F., et\u00a0al. (2022). An invisible black-box backdoor attack through frequency domain. In: European Conference on Computer Vision, Springer, pp. 396\u2013413.","DOI":"10.1007\/978-3-031-19778-9_23"},{"key":"2368_CR44","first-page":"10546","volume":"35","author":"B Wu","year":"2022","unstructured":"Wu, B., Chen, H., Zhang, M., et al. (2022). Backdoorbench: A comprehensive benchmark of backdoor learning. Advances in Neural Information Processing Systems, 35, 10546\u201310559.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2368_CR45","doi-asserted-by":"crossref","unstructured":"Wu, Y., Han, X., Qiu, H., et\u00a0al. (2023). Computation and data efficient backdoor attacks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4805\u20134814.","DOI":"10.1109\/ICCV51070.2023.00443"},{"key":"2368_CR46","unstructured":"Xu, Y., Yao, J., Shu, M., et\u00a0al. (2024). Shadowcast: Stealthy data poisoning attacks against vision-language models. In: ICLR 2024 Workshop on Navigating and Addressing Data Problems for Foundation Models."},{"key":"2368_CR47","unstructured":"Yang, W., Gao, J., Mirzasoleiman, B. (2024). Robust contrastive language-image pretraining against data poisoning and backdoor attacks. Advances in Neural Information Processing Systems 36"},{"key":"2368_CR48","unstructured":"Yao, J.Y., Ning, K.P., Liu, Z.H., et\u00a0al (2023). Llm lies: Hallucinations are not bugs, but features as adversarial examples. arXiv preprint arXiv:2310.01469"},{"key":"2368_CR49","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., et al. (2014). From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics, 2, 67\u201378.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2368_CR50","doi-asserted-by":"crossref","unstructured":"Zhai, S., Dong, Y., Shen, Q., et\u00a0al. (2023). Text-to-image diffusion models can be easily backdoored through multimodal data poisoning. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 1577\u20131587.","DOI":"10.1145\/3581783.3612108"},{"key":"2368_CR51","unstructured":"Zhang, S., Roller, S., Goyal, N., et\u00a0al. (2022). Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068"},{"key":"2368_CR52","unstructured":"Zhang, S., Dong, L., Li, X., et\u00a0al. (2023). Instruction tuning for large language models: A survey. arXiv preprint arXiv:2308.10792"},{"key":"2368_CR53","doi-asserted-by":"publisher","first-page":"2558","DOI":"10.1109\/TIP.2024.3378918","volume":"33","author":"Z Zhang","year":"2024","unstructured":"Zhang, Z., Yuan, X., Zhu, L., et al. (2024). Badcm: Invisible backdoor attack against cross-modal learning. IEEE Transactions on Image Processing, 33, 2558\u20132571.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2368_CR54","unstructured":"Zhu, D., Chen, J., Shen, X., et\u00a0al. (2024). Minigpt-4: Enhancing vision-language understanding with advanced large language models. In: The Twelfth International Conference on Learning Representations"},{"key":"2368_CR55","doi-asserted-by":"crossref","unstructured":"Zhu, M., Wei, S., Shen, L., et\u00a0al. (2023). Enhancing fine-tuning based backdoor defense with sharpness-aware minimization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4466\u20134477.","DOI":"10.1109\/ICCV51070.2023.00412"},{"key":"2368_CR56","unstructured":"Zou, A., Wang, Z., Carlini, N., et\u00a0al. (2023). Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02368-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02368-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02368-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T06:04:01Z","timestamp":1749276241000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02368-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,19]]},"references-count":56,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["2368"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02368-9","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,19]]},"assertion":[{"value":"1 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}