{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T17:35:42Z","timestamp":1777570542110,"version":"3.51.4"},"publisher-location":"Cham","reference-count":65,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726729","type":"print"},{"value":"9783031726736","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T00:00:00Z","timestamp":1729555200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T00:00:00Z","timestamp":1729555200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72673-6_26","type":"book-chapter","created":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T16:03:50Z","timestamp":1729526630000},"page":"474-491","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["FineMatch: Aspect-Based Fine-Grained Image and\u00a0Text Mismatch Detection and\u00a0Correction"],"prefix":"10.1007","author":[{"given":"Hang","family":"Hua","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jing","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kushal","family":"Kafle","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Simon","family":"Jenni","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daoan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"John","family":"Collomosse","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Scott","family":"Cohen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,22]]},"reference":[{"key":"26_CR1","unstructured":"Gpt-4V(ision) system card (2023). https:\/\/api.semanticscholar.org\/CorpusID:263218031"},{"key":"26_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: nocaps: novel object captioning at scale. International Conference on Computer Vision pp. 8947\u20138956 (2019). https:\/\/api.semanticscholar.org\/CorpusID:56517630","DOI":"10.1109\/ICCV.2019.00904"},{"key":"26_CR3","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR4","unstructured":"Awadalla, A., et\u00a0al.: OpenFlamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"26_CR5","unstructured":"Byeon, M., Park, B., Kim, H., Lee, S., Baek, W., Kim, S.: Coyo-700m: Image-text pair dataset (2022). https:\/\/github.com\/kakaobrain\/coyo-dataset"},{"key":"26_CR6","unstructured":"Chen, J., Huang, Y., Lv, T., Cui, L., Chen, Q., Wei, F.: TextDiffuser: Diffusion models as text painters. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"26_CR7","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"26_CR8","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: ShareGPT4V: improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"26_CR9","doi-asserted-by":"crossref","unstructured":"Diwan, A., Berry, L., Choi, E., Harwath, D., Mahowald, K.: Why is winoground hard? Investigating failures in visuolinguistic compositionality. arXiv preprint arXiv:2211.00768 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.143"},{"key":"26_CR10","unstructured":"Dong, X., et al.: InternLM-XComposer2: mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420 (2024)"},{"key":"26_CR11","unstructured":"Gao, P., et al.: Llama-adapter V2: parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"26_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"417","DOI":"10.1007\/978-3-030-58520-4_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"D Gurari","year":"2020","unstructured":"Gurari, D., Zhao, Y., Zhang, M., Bhattacharya, N.: Captioning images taken by people who are blind. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 417\u2013434. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_25"},{"key":"26_CR13","unstructured":"Hsieh, C.Y., Zhang, J., Ma, Z., Kembhavi, A., Krishna, R.: SugarCrepe: fixing hackable benchmarks for vision-language compositionality. arXiv preprint arXiv:2306.14610 (2023)"},{"key":"26_CR14","doi-asserted-by":"crossref","unstructured":"Hu, Y., Hua, H., Yang, Z., Shi, W., Smith, N.A., Luo, J.: PromptCap: prompt-guided task-aware image captioning. arXiv preprint arXiv:2211.09699 (2022)","DOI":"10.1109\/ICCV51070.2023.00277"},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Hua, H., Li, X., Dou, D., Xu, C.Z., Luo, J.: Noise stability regularization for improving BERT fine-tuning. arXiv preprint arXiv:2107.04835 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.258"},{"key":"26_CR16","unstructured":"Hua, H., Li, X., Dou, D., Xu, C.Z., Luo, J.: Fine-tuning pre-trained language models with noise stability regularization. arXiv preprint arXiv:2206.05658 (2022)"},{"key":"26_CR17","unstructured":"Hua, H., Tang, Y., Xu, C., Luo, J.: V2Xum-LLM: cross-modal video summarization with temporal prompt instruction tuning. arXiv preprint arXiv:2404.12353 (2024)"},{"key":"26_CR18","unstructured":"Huang, K., Sun, K., Xie, E., Li, Z., Liu, X.: T2i-CompBench: a comprehensive benchmark for open-world compositional text-to-image generation. ArXiv abs\/2307.06350 (2023). https:\/\/api.semanticscholar.org\/CorpusID:259847295"},{"key":"26_CR19","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: A new dataset for real-world visual reasoning and compositional question answering. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"26_CR20","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: Openclip (2021). https:\/\/doi.org\/10.5281\/zenodo.5143773, https:\/\/doi.org\/10.5281\/zenodo.5143773, if you use this software, please cite it as below","DOI":"10.5281\/zenodo.5143773"},{"key":"26_CR21","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"26_CR22","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"26_CR23","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"26_CR24","doi-asserted-by":"crossref","unstructured":"Lin, J., Hua, H., Chen, M., Li, Y., Hsiao, J., Ho, C., Luo, J.: VideoXum: cross-modal visual and textural summarization of videos. arXiv preprint arXiv:2303.12060 (2023)","DOI":"10.1109\/TMM.2023.3335875"},{"key":"26_CR25","unstructured":"Liu, F., Lin, K., Li, L., Wang, J., Yacoob, Y., Wang, L.: Aligning large multi-modal model with robust instruction tuning. arXiv preprint arXiv:2306.14565 (2023)"},{"key":"26_CR26","unstructured":"Liu, H., et al.: A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253 (2024)"},{"key":"26_CR27","unstructured":"Liu, H., et al.: LLaVA-NeXT: improved reasoning, OCR, and world knowledge (2024). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"26_CR28","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. ArXiv abs\/2304.08485 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258179774"},{"key":"26_CR29","doi-asserted-by":"crossref","unstructured":"Liu, J., Wang, W., Wang, D., Smith, N.A., Choi, Y., Hajishirzi, H.: Vera: a general-purpose plausibility estimation model for commonsense statements. arXiv preprint arXiv:2305.03695 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.81"},{"key":"26_CR30","unstructured":"Lu, X.H., Kasner, Z., Reddy, S.: WebLINX: real-world website navigation with multi-turn dialogue (2024). https:\/\/api.semanticscholar.org\/CorpusID:267547883"},{"key":"26_CR31","doi-asserted-by":"crossref","unstructured":"Ma, Z., Hong, J., Gul, M.O., Gandhi, M., Gao, I., Krishna, R.: CREPE: can vision-language foundation models reason compositionally? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10910\u201310921 (2023)","DOI":"10.1109\/CVPR52729.2023.01050"},{"key":"26_CR32","doi-asserted-by":"crossref","unstructured":"Morris, J.X., Lifland, E., Yoo, J.Y., Grigsby, J., Jin, D., Qi, Y.: TextAttack: a framework for adversarial attacks, data augmentation, and adversarial training in NLP. arXiv preprint arXiv:2005.05909 (2020)","DOI":"10.18653\/v1\/2020.emnlp-demos.16"},{"key":"26_CR33","doi-asserted-by":"publisher","unstructured":"Mu, N., Kirillov, A., Wagner, D., Xie, S.: SLIP: Self-supervision Meets Language-Image Pre-training. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13686. Springer, Cham. https:\/\/doi.org\/10.1007\/978-3-031-19809-0_30","DOI":"10.1007\/978-3-031-19809-0_30"},{"key":"26_CR34","unstructured":"OpenAI: GPT-4 technical report. ArXiv abs\/2303.08774 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257532815"},{"key":"26_CR35","doi-asserted-by":"crossref","unstructured":"Parcalabescu, L., Cafagna, M., Muradjan, L., Frank, A., Calixto, I., Gatt, A.: VALSE: a task-independent benchmark for vision and language models centered on linguistic phenomena. arXiv preprint arXiv:2112.07566 (2021)","DOI":"10.18653\/v1\/2022.acl-long.567"},{"key":"26_CR36","unstructured":"Parcalabescu, L., Gatt, A., Frank, A., Calixto, I.: Seeing past words: testing the cross-modal capabilities of pretrained V &L models on counting tasks. arXiv preprint arXiv:2012.12352 (2020)"},{"key":"26_CR37","doi-asserted-by":"crossref","unstructured":"Popovic, M.: chrF: character n-gram F-score for automatic MT evaluation. In: WMT@EMNLP (2015). https:\/\/api.semanticscholar.org\/CorpusID:15349458","DOI":"10.18653\/v1\/W15-3049"},{"key":"26_CR38","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"26_CR39","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., Liang, P.: SQuAD: 100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250 (2016)","DOI":"10.18653\/v1\/D16-1264"},{"key":"26_CR40","unstructured":"Rawte, V., Sheth, A., Das, A.: A survey of hallucination in large foundation models. arXiv preprint arXiv:2309.05922 (2023)"},{"key":"26_CR41","unstructured":"Ray, A., Radenovic, F., Dubey, A., Plummer, B.A., Krishna, R., Saenko, K.: COLA: a benchmark for compositional text-to-image retrieval (2023). https:\/\/api.semanticscholar.org\/CorpusID:258546995"},{"key":"26_CR42","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"26_CR43","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. ArXiv abs\/2210.08402 (2022). https:\/\/api.semanticscholar.org\/CorpusID:252917726"},{"key":"26_CR44","doi-asserted-by":"crossref","unstructured":"Singh, A., et a;l.: FLAVA: a foundational language and vision alignment model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15638\u201315650 (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"26_CR45","unstructured":"Smithsonian Institution: Smithsonian Open Access (2023). https:\/\/www.si.edu\/openaccess"},{"key":"26_CR46","doi-asserted-by":"crossref","unstructured":"Song, L., Yin, G., Jin, Z., Dong, X., Xu, C.: Emotional listener portrait: Realistic listener motion simulation in conversation. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 20782\u201320792. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.01905"},{"key":"26_CR47","unstructured":"Song, Y., et al.: ObjectStitch: generative object compositing. arXiv preprint arXiv:2212.00932 (2022)"},{"key":"26_CR48","unstructured":"Sun, Q., et\u00a0al.: Generative multimodal models are in-context learners. arXiv preprint arXiv:2312.13286 (2023)"},{"key":"26_CR49","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"26_CR50","doi-asserted-by":"crossref","unstructured":"Thrush, T., Jiang, R., Bartolo, M., Singh, A., Williams, A., Kiela, D., Ross, C.: Winoground: probing vision and language models for visio-linguistic compositionality. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5238\u20135248 (2022)","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"26_CR51","unstructured":"Touvron, H., et al.: Llama: open and efficient foundation language models. ArXiv abs\/2302.13971 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257219404"},{"key":"26_CR52","unstructured":"Wang, B., et\u00a0al.: VIGC: visual instruction generation and correction. arXiv preprint arXiv:2308.12714 (2023)"},{"key":"26_CR53","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"26_CR54","unstructured":"Wang, T., et\u00a0al.: Caption anything: interactive image description with diverse multimodal controls. arXiv preprint arXiv:2305.02677 (2023)"},{"key":"26_CR55","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: Beit pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"26_CR56","unstructured":"Yarom, M., et al.: What you see is what you read? Improving text-image alignment evaluation. arXiv preprint arXiv:2305.10400 (2023)"},{"key":"26_CR57","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration. arXiv preprint arXiv:2311.04257 (2023)","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"26_CR58","unstructured":"Yin, S., et al.: Woodpecker: hallucination correction for multimodal large language models. arXiv preprint arXiv:2310.16045 (2023)"},{"key":"26_CR59","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"26_CR60","unstructured":"Yu, Y., Zeng, Z., Hua, H., Fu, J., Luo, J.: PromptFix: you prompt and we fix the photo. arXiv preprint arXiv:2405.16785 (2024)"},{"key":"26_CR61","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., Zou, J.: When and why vision-language models behave like bags-of-words, and what to do about it? In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"26_CR62","unstructured":"Zhang, K., Mo, L., Chen, W., Sun, H., Su, Y.: MagicBrush: a manually annotated dataset for instruction-guided image editing. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"26_CR63","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: BertScore: evaluating text generation with BERT. arXiv preprint arXiv:1904.09675 (2019)"},{"key":"26_CR64","unstructured":"Zhao, H., et al.: MMICL: empowering vision-language model with multi-modal in-context learning. arXiv preprint arXiv:2309.07915 (2023)"},{"key":"26_CR65","doi-asserted-by":"crossref","unstructured":"Zhao, T., et al.: Vl-checklist: evaluating pre-trained vision-language models with objects, attributes and relations. arXiv preprint arXiv:2207.00221 (2022)","DOI":"10.18653\/v1\/2022.emnlp-demos.4"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72673-6_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T00:04:38Z","timestamp":1732925078000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72673-6_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,22]]},"ISBN":["9783031726729","9783031726736"],"references-count":65,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72673-6_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,22]]},"assertion":[{"value":"22 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}