{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T00:31:08Z","timestamp":1742949068062,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":31,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819786190"},{"type":"electronic","value":"9789819786206"}],"license":[{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8620-6_7","type":"book-chapter","created":{"date-parts":[[2024,10,19]],"date-time":"2024-10-19T21:02:10Z","timestamp":1729371730000},"page":"98-113","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Attribute Comprehension in Large Vision-Language Models"],"prefix":"10.1007","author":[{"given":"Haiwen","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Zixi","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Yuanzhi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xinran","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zheqi","family":"He","sequence":"additional","affiliation":[]},{"given":"Kongming","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Zhanyu","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,20]]},"reference":[{"key":"7_CR1","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"7_CR2","doi-asserted-by":"crossref","unstructured":"Bravo, M.A., Mittal, S., Ging, S., Brox, T.: Open-vocabulary attribute detection. In: IEEE CVPR, pp. 7041\u20137050 (2023)","DOI":"10.1109\/CVPR52729.2023.00680"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: Pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"7_CR4","unstructured":"Chuang, C.Y., Varun, J., Li, Y., Torralba, A., Jegelka, S.: Debiasing vision-language models via biased prompts. arXiv:2302.00070 (2023)"},{"issue":"5","key":"7_CR5","doi-asserted-by":"publisher","first-page":"5561","DOI":"10.1109\/TPAMI.2022.3210780","volume":"45","author":"D Gao","year":"2023","unstructured":"Gao, D., Wang, R., Shan, S., Chen, X.: Cric: a vqa dataset for compositional reasoning on vision and commonsense. IEEE TPAMI 45(5), 5561\u20135578 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2022.3210780","journal-title":"IEEE TPAMI"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in Visual Question Answering. In: IEEE CVPR (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"7_CR7","unstructured":"Huang, X., Huang, Y.J., Zhang, Y., Tian, W., Feng, R., Zhang, Y., Xie, Y., Li, Y., Zhang, L.: Open-set image tagging with multi-grained text supervision. arXiv:2310 (2023)"},{"key":"7_CR8","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.J., Shamma, D.A., et al.: Visual genome: Connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Li, C., Xu, H., Tian, J., Wang, W., Yan, M., Bi, B., Ye, J., Chen, H., Xu, G., Cao, Z., et\u00a0al.: mplug: Effective and efficient vision-language learning by cross-modal skip-connections. arXiv:2205.12005 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"7_CR10","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv:2301.12597 (2023)"},{"key":"7_CR11","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022)"},{"key":"7_CR12","unstructured":"Li, J., Selvaraju, R.R., Gotmare, A.D., Joty, S., Xiong, C., Hoi, S.: Align before fuse: Vision and language representation learning with momentum distillation. In: NeurIPS (2021)"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"7_CR14","doi-asserted-by":"crossref","unstructured":"Liang, K., Wang, X., Zhang, H., Ma, Z., Guo, J.: Hierarchical visual attribute learning in the wild. In: ACM MM, pp. 3415\u20133423 (2023)","DOI":"10.1145\/3581783.3612274"},{"key":"7_CR15","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"7_CR16","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.: Im2text: Describing images using 1 million captioned photographs. Advances in Neural Information Processing Systems 24 (2011)"},{"key":"7_CR17","unstructured":"Patel, D., Dangati, P., Lee, J.Y., Boratko, M., McCallum, A.: Modeling label space interactions in multi-label classification using box embeddings. In: ICLR 2022 Poster (2022)"},{"key":"7_CR18","doi-asserted-by":"crossref","unstructured":"Patterson, G., Hays, J.: Coco attributes: Attributes for people, animals, and objects. In: ECCV, pp. 85\u2013100. Springer (2016)","DOI":"10.1007\/978-3-319-46466-4_6"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Pham, K., Kafle, K., Lin, Z., Ding, Z., Cohen, S., Tran, Q., Shrivastava, A.: Learning to predict visual attributes in the wild. In: IEEE CVPR, pp. 13018\u201313028 (2021)","DOI":"10.1109\/CVPR46437.2021.01282"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Pham, K., Kafle, K., Lin, Z., Ding, Z., Cohen, S., Tran, Q., Shrivastava, A.: Improving closed and open-vocabulary attribute prediction using transformers. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19806-9_12"},{"key":"7_CR21","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"7_CR22","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763. PMLR (2021)"},{"key":"7_CR23","unstructured":"Schuhmann, C., Vencu, R., Beaumont, R., Kaczmarczyk, R., Mullis, C., Katta, A., Coombes, T., Jitsev, J., Komatsuzaki, A.: Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv:2111.02114 (2021)"},{"key":"7_CR24","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: Visual explanations from deep networks via gradient-based localization. In: IEEE ICCV, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"7_CR26","unstructured":"Yamada, Y., Tang, Y., Yildirim, I.: When are lemons purple? the concept association bias of clip. arXiv:2212.12043 (2022)"},{"key":"7_CR27","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., Zou, J.: When and why vision-language models behave like bag-of-words models, and what to do about it? arXiv:2210.01936 (2022)"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Zeng, H., Ai, H., Zhuang, Z., Chen, L.: Multi-task learning via co-attentive sharing for pedestrian attribute recognition. In: IEEE ICME, pp.\u00a01\u20136 (2020)","DOI":"10.1109\/ICME46284.2020.9102757"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, P., Goyal, Y., Summers-Stay, D., Batra, D., Parikh, D.: Yin and Yang: balancing and answering binary visual questions. In: IEEE CVPR (2016)","DOI":"10.1109\/CVPR.2016.542"},{"key":"7_CR30","doi-asserted-by":"publisher","unstructured":"Zhao, T., Zhang, T., Zhu, M., Shen, H., Lee, K., Lu, X., Yin, J.: Vl-checklist: Evaluating pre-trained vision-language models with objects, attributes and relations. https:\/\/doi.org\/10.48550\/ARXIV.2207.00221. https:\/\/arxiv.org\/abs\/2207.00221 (2022)","DOI":"10.48550\/ARXIV.2207.00221"},{"key":"7_CR31","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8620-6_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T20:17:00Z","timestamp":1736885820000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8620-6_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,20]]},"ISBN":["9789819786190","9789819786206"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8620-6_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,20]]},"assertion":[{"value":"20 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}