{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T04:42:02Z","timestamp":1768538522951,"version":"3.49.0"},"publisher-location":"Cham","reference-count":76,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031734038","type":"print"},{"value":"9783031734045","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73404-5_15","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T16:03:13Z","timestamp":1730217793000},"page":"251-268","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Attention Prompting on\u00a0Image for\u00a0Large Vision-Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6321-9614","authenticated-orcid":false,"given":"Runpeng","family":"Yu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3349-5890","authenticated-orcid":false,"given":"Weihao","family":"Yu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0057-1404","authenticated-orcid":false,"given":"Xinchao","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"15_CR1","unstructured":"Achiam, J., et\u00a0al.: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"15_CR2","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)"},{"key":"15_CR3","unstructured":"Asai, A., Wu, Z., Wang, Y., Sil, A., Hajishirzi, H.: Self-rag: learning to retrieve, generate, and critique through self-reflection. CoRR (2023)"},{"key":"15_CR4","unstructured":"Awadalla, A., et\u00a0al.: Openflamingo: An open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Bigham, J.P., et al.: Vizwiz: nearly real-time answers to visual questions. In: Proceedings of the 23nd Annual ACM Symposium on User Interface Software and Technology, pp. 333\u2013342 (2010)","DOI":"10.1145\/1866029.1866080"},{"key":"15_CR6","unstructured":"Chen, X., Lin, M., Sch\u00e4rli, N., Zhou, D.: Teaching large language models to self-debug. CoRR (2023)"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Chowdhury, S., Nag, S., Manocha, D.: Apollo : unified adapter and prompt learning for vision language models. In: Conference on Empirical Methods in Natural Language Processing, EMNLP (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.629"},{"key":"15_CR8","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: Thirty-seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=vvoWPYqZJA"},{"key":"15_CR9","unstructured":"Darcet, T., Oquab, M., Mairal, J., Bojanowski, P.: Vision transformers need registers. CoRR (2023)"},{"key":"15_CR10","unstructured":"Dong, B., Zhou, P., Yan, S., Zuo, W.: LPT: long-tailed prompt tuning for image classification. CoRR (2022)"},{"key":"15_CR11","unstructured":"Du, Y., Li, S., Torralba, A., Tenenbaum, J.B., Mordatch, I.: Improving factuality and reasoning in language models through multiagent debate. CoRR (2023)"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"Du, Y., Wei, F., Zhang, Z., Shi, M., Gao, Y., Li, G.: Learning to prompt for open-vocabulary object detection with vision-language model. In: IEEE \/ CVF Computer Vision and Pattern Recognition Conference (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Fahes, M., Vu, T., Bursuc, A., P\u00e9rez, P., de\u00a0Charette, R.: P\u00f8da: Prompt-driven zero-shot domain adaptation. CoRR (2022)","DOI":"10.1109\/ICCV51070.2023.01707"},{"key":"15_CR14","unstructured":"Fu, C., Chen, P., Shen, Y., Qin, Y., Zhang, M., Lin, X., Qiu, Z., Lin, W., Yang, J., Zheng, X., Li, K., Sun, X., Ji, R.: MME: A comprehensive evaluation benchmark for multimodal large language models. CoRR abs\/2306.13394 (2023)"},{"key":"15_CR15","doi-asserted-by":"publisher","first-page":"105151","DOI":"10.1016\/j.engappai.2022.105151","volume":"115","author":"MA Ganaie","year":"2022","unstructured":"Ganaie, M.A., Hu, M., Malik, A.K., Tanveer, M., Suganthan, P.N.: Ensemble deep learning: a review. Eng. Appl. Artif. Intell. 115, 105151 (2022)","journal-title":"Eng. Appl. Artif. Intell."},{"key":"15_CR16","unstructured":"Gandelsman, Y., Efros, A.A., Steinhardt, J.: Interpreting CLIP\u2019s image representation via text-based decomposition. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"15_CR17","unstructured":"Gao, P., et\u00a0al.: Llama-adapter v2: parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Gao, T., Fisch, A., Chen, D.: Making pre-trained language models better few-shot learners. In: Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, ACL\/IJCNLP (2021)","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"15_CR19","doi-asserted-by":"crossref","unstructured":"Guo, Z., Dong, B., Ji, Z., Bai, J., Guo, Y., Zuo, W.: Texts as images in prompt tuning for multi-label image recognition. In: IEEE \/ CVF Computer Vision and Pattern Recognition Conference (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00275"},{"key":"15_CR20","doi-asserted-by":"crossref","unstructured":"Jia, M., Tang, L., Chen, B., Cardie, C., Belongie, S.J., Hariharan, B., Lim, S.: Visual prompt tuning. In: European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. In: arXiv (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"15_CR22","first-page":"22199","volume":"35","author":"T Kojima","year":"2022","unstructured":"Kojima, T., Gu, S.S., Reid, M., Matsuo, Y., Iwasawa, Y.: Large language models are zero-shot reasoners. Adv. Neural. Inf. Process. Syst. 35, 22199\u201322213 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR23","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"15_CR24","unstructured":"Li, F., et al.: Visual in-context prompting. CoRR (2023)"},{"key":"15_CR25","unstructured":"Li, S., Du, Y., Tenenbaum, J.B., Torralba, A., Mordatch, I.: Composing ensembles of pre-trained models via iterative consensus. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"15_CR26","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. In: The 2023 Conference on Empirical Methods in Natural Language Processing (2023). https:\/\/openreview.net\/forum?id=xozJw0kZXF","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"15_CR27","unstructured":"Lin, Z., et al.: SPHINX: the joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. CoRR abs\/2311.07575 (2023)"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"15_CR29","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Conference on Neural Information Processing Systems (NeurlPS) (2023)"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 195:1\u2013195:35 (2023)","DOI":"10.1145\/3560815"},{"key":"15_CR31","first-page":"21702","volume":"36","author":"X Ma","year":"2023","unstructured":"Ma, X., Fang, G., Wang, X.: LLM-pruner: on the structural pruning of large language models. Adv. Neural. Inf. Process. Syst. 36, 21702\u201321720 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR32","unstructured":"Madaan, A., et al.: Self-refine: Iterative refinement with self-feedback. In: Conference on Neural Information Processing Systems (NeurlPS) (2023)"},{"key":"15_CR33","unstructured":"Miao, N., Teh, Y.W., Rainforth, T.: Selfcheck: using LLMS to zero-shot check their own step-by-step reasoning. CoRR (2023)"},{"key":"15_CR34","unstructured":"Niu, H., Li, H., Zhao, F., Li, B.: Domain-unified prompt representations for source-free domain generalization. CoRR (2022)"},{"key":"15_CR35","doi-asserted-by":"crossref","unstructured":"Pan, L., Saxon, M., Xu, W., Nathani, D., Wang, X., Wang, W.Y.: Automatically correcting large language models: Surveying the landscape of diverse self-correction strategies. arXiv preprint arXiv:2308.03188 (2023)","DOI":"10.1162\/tacl_a_00660"},{"key":"15_CR36","doi-asserted-by":"crossref","unstructured":"Pan, T., Tang, L., Wang, X., Shan, S.: Tokenize anything via prompting. CoRR (2023)","DOI":"10.1007\/978-3-031-72970-6_19"},{"key":"15_CR37","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML) (2021)"},{"key":"15_CR38","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) International Conference on Machine Learning (ICML) (2021)"},{"key":"15_CR39","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: Denseclip: language-guided dense prediction with context-aware prompting. In: IEEE \/ CVF Computer Vision and Pattern Recognition Conference (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"15_CR40","unstructured":"Reddy, G.: The mechanistic basis of data dependence and abrupt learning in an in-context classification task. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"15_CR41","doi-asserted-by":"crossref","unstructured":"Sahoo, P., Singh, A.K., Saha, S., Jain, V., Mondal, S., Chadha, A.: A systematic survey of prompt engineering in large language models: techniques and applications. CoRR (2024)","DOI":"10.1007\/979-8-8688-0569-1_4"},{"key":"15_CR42","unstructured":"Shen, S., et al.: Multitask vision-language prompt tuning. CoRR (2022)"},{"key":"15_CR43","unstructured":"Shinn, N., Cassano, F., Gopinath, A., Narasimhan, K., Yao, S.: Reflexion: language agents with verbal reinforcement learning. In: Conference on Neural Information Processing Systems (NeurlPS) (2023)"},{"key":"15_CR44","unstructured":"Shinn, N., Cassano, F., Gopinath, A., Narasimhan, K., Yao, S.: Reflexion: language agents with verbal reinforcement learning. Adv. Neural Inf. Proce. Syst. 36 (2024)"},{"key":"15_CR45","doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Rupprecht, C., Vedaldi, A.: What does CLIP know about a red circle? visual prompt engineering for vlms. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"15_CR46","unstructured":"Shu, M., et al.: Test-time prompt tuning for zero-shot generalization in vision-language models. CoRR (2022)"},{"key":"15_CR47","unstructured":"Shu, M., et al.: Test-time prompt tuning for zero-shot generalization in vision-language models. In: Conference on Neural Information Processing Systems 2022, NeurIPS (2022)"},{"key":"15_CR48","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: IEEE \/ CVF Computer Vision and Pattern Recognition Conference (CVPR), pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"15_CR49","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"15_CR50","unstructured":"Touvron, H., et\u00a0al.: Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"15_CR51","unstructured":"Touvron, H., et\u00a0al.: Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"15_CR52","unstructured":"Wang, T., et al.: Caption anything: interactive image description with diverse multimodal controls (2023)"},{"key":"15_CR53","unstructured":"Wang, W., et al.: Cogvlm: visual expert for pretrained language models (2023)"},{"key":"15_CR54","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: The all-seeing project v2: towards general relation comprehension of the open world (2024)","DOI":"10.1007\/978-3-031-73414-4_27"},{"key":"15_CR55","unstructured":"Wang, W., Cao, Y., Zhang, J., Tao, D.: FP-DETR: detection transformer advanced by fully pre-training. In: International Conference on Learning Representations (ICLR) (2022)"},{"key":"15_CR56","unstructured":"Wang, Y., Huang, Z., Hong, X.: S-prompts learning with pre-trained transformers: an occam\u2019s razor for domain incremental learning. CoRR (2022)"},{"key":"15_CR57","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Learning to prompt for continual learning. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00024"},{"key":"15_CR58","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR59","unstructured":"Wu, C.H., Motamed, S., Srivastava, S., la\u00a0Torre, F.D.: Generative visual prompt: Unifying distributional control of pre-trained generative models. CoRR (2022)"},{"key":"15_CR60","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual chatgpt: talking, drawing and editing with visual foundation models. CoRR (2023)"},{"key":"15_CR61","unstructured":"Yang, J., Zhang, H., Li, F., Zou, X., Li, C., Gao, J.: Set-of-mark prompting unleashes extraordinary visual grounding in GPT-4V. CoRR (2023)"},{"key":"15_CR62","unstructured":"Yang, L., Wang, Y., Li, X., Wang, X., Yang, J.: Fine-grained visual prompting. In: Conference on Neural Information Processing Systems (NeurlPS) (2023)"},{"key":"15_CR63","unstructured":"Yang, Z., Li, L., Lin, K., Wang, J., Lin, C.C., Liu, Z., Wang, L.: The dawn of LMMS: preliminary explorations with GPT-4v(ision) (2023)"},{"key":"15_CR64","unstructured":"Yang, Z., et al.: Mm-react: Prompting chatgpt for multimodal reasoning and action. arXiv preprint arXiv:2303.11381 (2023)"},{"key":"15_CR65","doi-asserted-by":"crossref","unstructured":"Yao, Y., Zhang, A., Zhang, Z., Liu, Z., Chua, T., Sun, M.: CPT: colorful prompt tuning for pre-trained vision-language models. CoRR (2021)","DOI":"10.18653\/v1\/2022.findings-acl.273"},{"key":"15_CR66","unstructured":"Yu, W., et al.: Mm-vet: evaluating large multimodal models for integrated capabilities (2023)"},{"key":"15_CR67","doi-asserted-by":"crossref","unstructured":"Yue, X., et al.: Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. arXiv preprint arXiv:2311.16502 (2023)","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"15_CR68","unstructured":"Zeng, A., et al.: Socratic models: composing zero-shot multimodal reasoning with language. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"15_CR69","unstructured":"Zhang, A., Ji, W., Chua, T.: Next-chat: an LMM for chat, detection and segmentation. CoRR (2023)"},{"key":"15_CR70","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Ma, Z., Gao, X., Shakiah, S., Gao, Q., Chai, J.: Groundhog: grounding large language models to holistic segmentation (2024)","DOI":"10.1109\/CVPR52733.2024.01349"},{"key":"15_CR71","unstructured":"Zhang, Z., Zhou, Y., Zhao, X., Che, T., Lyu, L.: Prompt certified machine unlearning with randomized gradient smoothing and quantization. In: Conference on Neural Information Processing Systems (NeurlPS) (2022)"},{"key":"15_CR72","unstructured":"Zheng, C., Liu, Z., Xie, E., Li, Z., Li, Y.: Progressive-hint prompting improves reasoning in large language models. CoRR (2023)"},{"key":"15_CR73","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: IEEE \/ CVF Computer Vision and Pattern Recognition Conference (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"15_CR74","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vis. (2022)","DOI":"10.1007\/s11263-022-01653-1"},{"key":"15_CR75","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"15_CR76","doi-asserted-by":"crossref","unstructured":"Zhu, X., Zhang, R., He, B., Zeng, Z., Zhang, S., Gao, P.: Pointclip V2: adapting CLIP for powerful 3D open-world learning. CoRR abs\/2211.11682 (2022)","DOI":"10.1109\/ICCV51070.2023.00249"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73404-5_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T19:46:17Z","timestamp":1745523977000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73404-5_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031734038","9783031734045"],"references-count":76,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73404-5_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}