{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T03:34:57Z","timestamp":1773804897750,"version":"3.50.1"},"publisher-location":"Cham","reference-count":64,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729829","type":"print"},{"value":"9783031729836","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72983-6_3","type":"book-chapter","created":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:34:20Z","timestamp":1730108060000},"page":"37-55","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["How Many Are in This Image A Safety Evaluation Benchmark for Vision LLMs"],"prefix":"10.1007","author":[{"given":"Haoqin","family":"Tu","sequence":"first","affiliation":[]},{"given":"Chenhang","family":"Cui","sequence":"additional","affiliation":[]},{"given":"Zijun","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yiyang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Bingchen","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Junlin","family":"Han","sequence":"additional","affiliation":[]},{"given":"Wangchunshu","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Huaxiu","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Cihang","family":"Xie","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,29]]},"reference":[{"key":"3_CR1","unstructured":"Aldahdooh, A., Hamidouche, W., Deforges, O.: Reveal of vision transformers robustness against adversarial attacks. arXiv preprint arXiv:2106.03734 (2021)"},{"key":"3_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: ICCV, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"3_CR3","unstructured":"Bai, J., et\u00a0al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"3_CR4","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"3_CR5","unstructured":"Bavishi, R., et al.: Introducing our multimodal models (2023). https:\/\/www.adept.ai\/blog\/fuyu-8b"},{"key":"3_CR6","unstructured":"Benz, P., Ham, S., Zhang, C., Karjauv, A., Kweon, I.S.: Adversarial robustness comparison of vision transformer and MLP-mixer to CNNs. arXiv preprint arXiv:2110.02797 (2021)"},{"key":"3_CR7","unstructured":"Bitton, Y., et al.: VisIT-bench: a benchmark for vision-language instruction following inspired by real-world use. arXiv preprint arXiv:2308.06595 (2023)"},{"key":"3_CR8","unstructured":"Brain, G., (2017). 
https:\/\/www.kaggle.com\/competitions\/nips-2017-non-targeted-adversarial-attack"},{"key":"3_CR9","doi-asserted-by":"publisher","first-page":"231","DOI":"10.1136\/bmjqs-2018-008370","volume":"28","author":"R Challen","year":"2019","unstructured":"Challen, R., Denny, J., Pitt, M., Gompels, L., Edwards, T., Tsaneva-Atanasova, K.: Artificial intelligence, bias and clinical safety. BMJ Q. Saf. 28, 231\u2013237 (2019)","journal-title":"BMJ Q. Saf."},{"key":"3_CR10","unstructured":"Chen, R., Zhang, H., Liang, S., Li, J., Cao, X.: Less is more: fewer interpretable region via submodular subset selection. In: ICLR (2024)"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Chen, W., Hays, J.: SketchyGAN: towards diverse and realistic sketch to image synthesis. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00981"},{"key":"3_CR12","unstructured":"Chiang, W.L., et\u00a0al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/vicunalmsys.org. Accessed 14 Apr 2023"},{"key":"3_CR13","unstructured":"Cui, C., et al.: Holistic analysis of hallucination in GPT-4V (ision): bias and interference challenges. arXiv preprint arXiv:2311.03287 (2023)"},{"key":"3_CR14","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. arXiv arXiv:2305.06500 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258615266"},{"key":"3_CR15","unstructured":"Diao, S., Zhou, W., Zhang, X., Wang, J.: Write and paint: generative vision-language models are unified modal learners. In: ICLR (2023)"},{"key":"3_CR16","unstructured":"Dong, Y., et al.: How robust is Google\u2019s bard to adversarial image attacks? arXiv preprint arXiv:2309.11751 (2023)"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Du, Y., Liu, Z., Li, J., Zhao, W.X.: A survey of vision-language pre-trained models. arXiv preprint arXiv:2202.10936 (2022)","DOI":"10.24963\/ijcai.2022\/762"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Eitz, M., Hays, J., Alexa, M.: How do humans sketch objects? In: SIGGRAPH (2012)","DOI":"10.1145\/2185520.2335395"},{"key":"3_CR19","unstructured":"Fu, C., et\u00a0al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"3_CR20","unstructured":"Gao, P., et\u00a0al.: LLaMA-adapter V2: parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"3_CR21","unstructured":"Gong, Y., et al.: FigStep: jailbreaking large vision-language models via typographic visual prompts. arXiv preprint arXiv:2311.05608 (2023)"},{"key":"3_CR22","unstructured":"Jigsaw, G., (2023). https:\/\/perspectiveapi.com\/"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Koley, S., Bhunia, A.K., Sain, A., Chowdhury, P.N., Xiang, T., Song, Y.Z.: Picture that sketch: photorealistic image generation from abstract sketches. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00662"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Li, X., Fang, Y., Liu, M., Ling, Z., Tu, Z., Su, H.: Distilling large vision-language model with out-of-distribution generalizability. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00236"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. 
arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"3_CR26","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"3_CR27","unstructured":"Liu, X., Zhu, Y., Lan, Y., Yang, C., Qiao, Y.: Query-relevant images jailbreak large multi-modal models. arXiv preprint arXiv:2311.17600 (2023)"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Y., et\u00a0al.: MMBench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"3_CR29","unstructured":"OpenAI: ChatGPT can now see, hear, and speak (2023). https:\/\/openai.com\/blog\/chatgpt-can-now-see-hear-and-speak"},{"key":"3_CR30","unstructured":"OpenAI: GPT-4 technical report, Technical report. OpenAI (2023)"},{"key":"3_CR31","unstructured":"OpenAI: GPT-4V(ision) technical work and authors, Technical report. OpenAI (2023). https:\/\/cdn.openai.com\/contributions\/gpt-4v.pdf"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Qi, X., Huang, K., Panda, A., Wang, M., Mittal, P.: Visual adversarial examples jailbreak aligned large language models. In: The Second Workshop on New Frontiers in Adversarial Machine Learning (2023)","DOI":"10.1609\/aaai.v38i19.30150"},{"key":"3_CR33","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Schlarmann, C., Hein, M.: On the adversarial robustness of multi-modal foundation models. In: ICCV (2023)","DOI":"10.1109\/ICCVW60793.2023.00395"},{"key":"3_CR35","unstructured":"Su, Y., Lan, T., Li, H., Xu, J., Wang, Y., Cai, D.: PandaGPT: one model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)"},{"key":"3_CR36","unstructured":"Szegedy, C., et al.: Intriguing properties of neural networks. arXiv preprint arXiv:1312.6199 (2013)"},{"key":"3_CR37","unstructured":"Tamkin, A., Brundage, M., Clark, J., Ganguli, D.: Understanding the capabilities, limitations, and societal impact of large language models. arXiv preprint arXiv:2102.02503 (2021)"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Tang, F., Gao, W., Peng, L., Zhan, J.: AGIBench: a multi-granularity, multimodal, human-referenced, auto-scoring benchmark for large language models. arXiv preprint arXiv:2309.06495 (2023)","DOI":"10.1007\/978-981-97-0316-6_9"},{"key":"3_CR39","unstructured":"Tatman, R., (2017). https:\/\/www.kaggle.com\/datasets\/rtatman\/english-word-frequency"},{"key":"3_CR40","unstructured":"DeepMind Interactive Agents Team, et\u00a0al.: Creating multimodal interactive agents with imitation and self-supervised learning. arXiv preprint arXiv:2112.03763 (2021)"},{"key":"3_CR41","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"3_CR42","unstructured":"Tong, S., Jones, E., Steinhardt, J.: Mass-producing failures of multimodal systems with language models. arXiv preprint arXiv:2306.12105 (2023)"},{"key":"3_CR43","unstructured":"Touvron, H., et\u00a0al.: Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"3_CR44","doi-asserted-by":"crossref","unstructured":"Tu, H., Li, Y., Mi, F., Yang, Z.: ReSee: responding through seeing fine-grained visual knowledge in open-domain dialogue. 
arXiv preprint arXiv:2305.13602 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.479"},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Tu, H., Yang, B., Zhao, X.: ZeroGEN: zero-shot multimodal controllable text generation with multiple oracles. arXiv preprint arXiv:2306.16649 (2023)","DOI":"10.1007\/978-3-031-44696-2_39"},{"key":"3_CR46","unstructured":"Tu, H., Zhao, B., Wei, C., Xie, C.: Sight beyond text: multi-modal training enhances LLMs in truthfulness and ethics. arXiv preprint arXiv:2309.07120 (2023)"},{"key":"3_CR47","unstructured":"Vidgen, B., et al.: SimpleSafetyTests: a test suite for identifying critical safety risks in large language models (2023)"},{"key":"3_CR48","unstructured":"Wang, J., et al.: An LLM-free multi-dimensional benchmark for MLLMs hallucination evaluation. arXiv preprint arXiv:2311.07397 (2023)"},{"key":"3_CR49","unstructured":"Wang, W., et\u00a0al.: CogVLM: visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)"},{"key":"3_CR50","unstructured":"Wang, Z., et\u00a0al.: JARVIS-1: open-world multi-task agents with memory-augmented multimodal language models. arXiv preprint arXiv:2311.05997 (2023)"},{"key":"3_CR51","unstructured":"Wei, Z., Wang, Y., Wang, Y.: Jailbreak and guard aligned language models with only few in-context demonstrations. arXiv preprint arXiv:2310.06387 (2023)"},{"key":"3_CR52","unstructured":"Ye, Q., et\u00a0al.: mPLUG-Owl: modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"3_CR53","unstructured":"Yu, J., Lin, X., Yu, Z., Xing, X.: GPTFUZZER: red teaming large language models with auto-generated jailbreak prompts. arXiv preprint arXiv:2309.10253 (2023)"},{"key":"3_CR54","doi-asserted-by":"crossref","unstructured":"Yue, X., et\u00a0al.: MMMU: a massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI. arXiv preprint arXiv:2311.16502 (2023)","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"3_CR55","unstructured":"Zeng, A., et\u00a0al.: GLM-130B: an open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)"},{"key":"3_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, L., Zhai, X., Zhao, Z., Wen, X., Zhao, B.: What if the TV was off? Examining counterfactual reasoning abilities of multi-modal language models. In: ICCVW (2023)","DOI":"10.1109\/ICCVW60793.2023.00497"},{"key":"3_CR57","unstructured":"Zhang, P., et\u00a0al.: InternLM-XComposer: a vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112 (2023)"},{"key":"3_CR58","doi-asserted-by":"publisher","unstructured":"Zhao, B., et al.: OOD-CV: a benchmark for robustness to out-of-distribution shifts of individual nuisances in natural images. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision, ECCV 2022. LNCS, vol. 13668, pp. 163\u2013180. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20074-8_10","DOI":"10.1007\/978-3-031-20074-8_10"},{"key":"3_CR59","unstructured":"Zhao, Y., et al.: On evaluating adversarial robustness of large vision-language models. In: NeurIPS (2024)"},{"key":"3_CR60","unstructured":"Zhou, W., et\u00a0al.: Agents: an open-source framework for autonomous language agents. arXiv preprint arXiv:2309.07870 (2023)"},{"key":"3_CR61","unstructured":"Zhou, W., Zeng, Y., Diao, S., Zhang, X.: VLUE: a multi-task multi-dimension benchmark for evaluating vision-language pre-training. 
In: ICML (2022)"},{"key":"3_CR62","unstructured":"Zhou, Y., et al.: Analyzing and mitigating object hallucination in large vision-language models. arXiv preprint arXiv:2310.00754 (2023)"},{"key":"3_CR63","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"3_CR64","unstructured":"Zou, A., Wang, Z., Kolter, J.Z., Fredrikson, M.: Universal and transferable adversarial attacks on aligned language models. arXiv preprint arXiv:2307.15043 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72983-6_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T10:34:50Z","timestamp":1732962890000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72983-6_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"ISBN":["9783031729829","9783031729836"],"references-count":64,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72983-6_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"29 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
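The record above follows the shape of a Crossref REST API "work" response. Below is a minimal sketch of how such a record could be fetched and a few of its fields read back out; it assumes (not stated in the record itself) that the JSON came from the public Crossref works endpoint at https://api.crossref.org/works/{DOI}, and the field names simply mirror the document above.

import json
import urllib.request

# DOI taken from the record above; the endpoint URL is an assumption about
# where this JSON was obtained (the standard Crossref REST API works route).
DOI = "10.1007/978-3-031-72983-6_3"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

# Top-level envelope: status plus a "message" holding the work metadata.
assert record["status"] == "ok" and record["message-type"] == "work"
work = record["message"]

# A few of the fields present in the record above.
title = work["title"][0]
authors = [f'{a["given"]} {a["family"]}' for a in work["author"]]
published = work["published-online"]["date-parts"][0]  # e.g. [2024, 10, 29]
n_refs = work["references-count"]

print(title)
print(", ".join(authors))
print(f"Published online: {published[0]}-{published[1]:02d}-{published[2]:02d}")
print(f"{n_refs} references; DOI: {work['DOI']}")

Run as-is, this prints the chapter title, the nine authors, the online publication date, and the reference count, matching the values deposited in the record.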