{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:16:38Z","timestamp":1775578598502,"version":"3.50.1"},"publisher-location":"Cham","reference-count":65,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730269","type":"print"},{"value":"9783031730276","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T00:00:00Z","timestamp":1732579200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T00:00:00Z","timestamp":1732579200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73027-6_17","type":"book-chapter","created":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T17:54:35Z","timestamp":1732557275000},"page":"291-309","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["DOCCI: Descriptions of\u00a0Connected and\u00a0Contrasting Images"],"prefix":"10.1007","author":[{"given":"Yasumasa","family":"Onoe","sequence":"first","affiliation":[]},{"given":"Sunayana","family":"Rane","sequence":"additional","affiliation":[]},{"given":"Zachary","family":"Berger","sequence":"additional","affiliation":[]},{"given":"Yonatan","family":"Bitton","sequence":"additional","affiliation":[]},{"given":"Jaemin","family":"Cho","sequence":"additional","affiliation":[]},{"given":"Roopal","family":"Garg","sequence":"additional","affiliation":[]},{"given":"Alexander","family":"Ku","sequence":"additional","affiliation":[]},{"given":"Zarana","family":"Parekh","sequence":"additional","affiliation":[]},{"given":"Jordi","family":"Pont-Tuset","sequence":"additional","affiliation":[]},{"given":"Garrett","family":"Tanzer","sequence":"additional","affiliation":[]},{"given":"Su","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jason","family":"Baldridge","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,26]]},"reference":[{"key":"17_CR1","unstructured":"Adobe: Adobe Firefly (2023)"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: nocaps: novel object captioning at scale. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"17_CR3","unstructured":"Anil, R., et\u00a0al.: PaLM 2 technical report. arXiv (2023)"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Bakr, E.M., Sun, P., Shen, X., Khan, F.F., Li, L.E., Elhoseiny, M.: HRS-Bench: holistic, reliable and scalable benchmark for text-to-image models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01834"},{"key":"17_CR5","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization (2005)"},{"key":"17_CR6","unstructured":"Chang, H., Zhang, H., et al.: Muse: text-To-image generation via masked generative transformers. 
In: ICML (2023)"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"17_CR8","unstructured":"Chen, X., et al.: PaLI-3 vision language models: smaller, faster, stronger (2023)"},{"key":"17_CR9","unstructured":"Chen, X., et al.: PaLI: a jointly-scaled multilingual language-image model. In: ICLR (2023)"},{"key":"17_CR10","unstructured":"Chen, X., et al.: Microsoft COCO Captions: data collection and evaluation server. arXiv (2015)"},{"key":"17_CR11","unstructured":"Cho, J., et al.: Davidsonian scene graph: improving reliability in fine-grained evaluation for text-image generation. In: ICLR (2024)"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Cho, J., Zala, A., Bansal, M.: DALL-Eval: probing the reasoning skills and social biases of text-to-image generation models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00283"},{"key":"17_CR13","unstructured":"Conwell, C., Ullman, T.D.: Testing relational understanding in text-guided image generation. arXiv (2022)"},{"key":"17_CR14","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. In: NeurIPS (2023)"},{"key":"17_CR15","unstructured":"Desai, K., Kaul, G., Aysola, Z.T., Johnson, J.: RedCaps: web-curated image-text data created by the people, for the people. In: NeurIPS: Datasets and Benchmarks Track (2021)"},{"key":"17_CR16","unstructured":"Doveh, S., et al.: Dense and aligned captions (DAC) promote compositional reasoning in VL models. In: NeurIPS (2023)"},{"key":"17_CR17","doi-asserted-by":"crossref","unstructured":"Freitag, M., Grangier, D., Caswell, I.: BLEU might be guilty but references are not innocent. In: Webber, B., Cohn, T., He, Y., Liu, Y. (eds.) EMNLP (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.5"},{"key":"17_CR18","unstructured":"Fu, S., et al.: DreamSim: learning new dimensions of human visual similarity using synthetic data. arXiv (2023)"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Gardner, M., et al.: Evaluating models\u2019 local decision boundaries via contrast sets. In: Cohn, T., He, Y., Liu, Y. (eds.) Findings of EMNLP (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.117"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Gebru, T., et al.: Datasheets for datasets (2021)","DOI":"10.1201\/9781003278290-23"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Le\u00a0Bras, R., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"17_CR22","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium (2018)"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: TIFA: accurate and interpretable text-to-image faithfulness evaluation with question answering. 
In: CVPR (2023)","DOI":"10.1109\/ICCV51070.2023.01866"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Hutchinson, B., Baldridge, J., Prabhakaran, V.: Underspecification in scene description-to-depiction tasks (2022)","DOI":"10.18653\/v1\/2022.aacl-main.86"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Jayasumana, S., Ramalingam, S., Veit, A., Glasner, D., Chakrabarti, A., Kumar, S.: Rethinking FID: towards a better evaluation metric for image generation (2024)","DOI":"10.1109\/CVPR52733.2024.00889"},{"key":"17_CR26","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML (2021)"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Kasai, J., et al.: Transparent human evaluation for image captioning. In: NAACL (2022)","DOI":"10.18653\/v1\/2022.naacl-main.254"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Kincaid, P., Fishburne, R.P., Rogers, R.L., Chissom, B.S.: Derivation of new readability formulas (automated readability index, fog count and Flesch reading ease formula) for navy enlisted personnel (1975)","DOI":"10.21236\/ADA006655"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. arXiv (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"17_CR30","unstructured":"Krasin, I., et al.: OpenImages: a public dataset for large-scale multi-label and multi-class image classification. Dataset https:\/\/github.com\/openimages (2016)"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Krause, J., Johnson, J., Krishna, R., Fei-Fei, L.: A hierarchical approach for generating descriptive image paragraphs. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.356"},{"issue":"1","key":"17_CR32","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual Genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int. J. Comput. Vis."},{"issue":"8","key":"17_CR33","first-page":"639","volume":"12","author":"GHM Laughlin","year":"1969","unstructured":"Laughlin, G.H.M.: SMOG grading-a new readability formula. J. Read. 12(8), 639\u2013646 (1969)","journal-title":"J. Read."},{"key":"17_CR34","unstructured":"Lee, T., et al.: Holistic evaluation of text-to-image models (2023)"},{"key":"17_CR35","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out (2004)"},{"key":"17_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"17_CR37","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. 
In: NeurIPS (2023)"},{"key":"17_CR38","unstructured":"Midjourney: Midjourney (2022)"},{"key":"17_CR39","doi-asserted-by":"crossref","unstructured":"Ohta,\u00a0S., Fukui,\u00a0N., Sakai, K.L.: Computational principles of syntax in the regions specialized for language: integrating theoretical linguistics and functional neuroimaging (2013)","DOI":"10.3389\/fnbeh.2013.00204"},{"key":"17_CR40","unstructured":"OpenAI, et al.: GPT-4 technical report (2024)"},{"key":"17_CR41","unstructured":"OpenAI: GPT-4V(ision) system card (2022)"},{"key":"17_CR42","unstructured":"OpenAI: DALL$$\\cdot $$E 3 system card (2023)"},{"key":"17_CR43","doi-asserted-by":"crossref","unstructured":"Otani, M., et al.: Toward verifiable and reproducible human evaluation for text-to-image generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01372"},{"key":"17_CR44","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"17_CR45","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. In: ICLR (2024)"},{"key":"17_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"647","DOI":"10.1007\/978-3-030-58558-7_38","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Pont-Tuset","year":"2020","unstructured":"Pont-Tuset, J., Uijlings, J., Changpinyo, S., Soricut, R., Ferrari, V.: Connecting vision and language with localized narratives. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 647\u2013664. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_38"},{"key":"17_CR47","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"17_CR48","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents (2022)"},{"key":"17_CR49","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"17_CR50","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"17_CR51","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: NeurIPS (2022)"},{"key":"17_CR52","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR53","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: NeurIPS: Datasets and Benchmarks Track (2022)"},{"key":"17_CR54","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. 
In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"17_CR55","doi-asserted-by":"crossref","unstructured":"Srinivasan, K., Raman, K., Chen, J., Bendersky, M., Najork, M.: WIT: Wikipedia-based image text dataset for multimodal multilingual machine learning. In: SIGIR (2021)","DOI":"10.1145\/3404835.3463257"},{"key":"17_CR56","unstructured":"Stein, G., et al.: Exposing flaws of generative model evaluation metrics and their unfair treatment of diffusion models. In: NeurIPS (2023)"},{"key":"17_CR57","doi-asserted-by":"crossref","unstructured":"Thapliyal, A., Pont-Tuset, J., Chen, X., Soricut, R.: CrossModal-3600: a massively multilingual multimodal evaluation dataset. In: EMNLP (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.45"},{"key":"17_CR58","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., et al.: YFCC100M: the new data in multimedia research. ACM Commun. 59, 64\u201373 (2016)","journal-title":"ACM Commun."},{"key":"17_CR59","doi-asserted-by":"crossref","unstructured":"Urbanek, J., Bordes, F., Astolfi, P., Williamson, M., Sharma, V., Romero-Soriano, A.: A picture is worth more than 77 text tokens: evaluating CLIP-style models on dense captions (2023)","DOI":"10.1109\/CVPR52733.2024.02521"},{"key":"17_CR60","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"17_CR61","doi-asserted-by":"crossref","unstructured":"Wang, S., et al.: Imagen Editor and EditBench: advancing and evaluating text-guided image inpainting. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01761"},{"key":"17_CR62","unstructured":"Yarom, M.: What you see is what you read? Improving text-image alignment evaluation. In: NeurIPS (2023)"},{"key":"17_CR63","doi-asserted-by":"crossref","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. In: TACL (2014)","DOI":"10.1162\/tacl_a_00166"},{"key":"17_CR64","unstructured":"Yu, J., et al.: Scaling autoregressive models for content-rich text-to-image generation (2022)"},{"key":"17_CR65","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. 
arXiv preprint arXiv:2206.10789, 2(3), 5 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73027-6_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:46:04Z","timestamp":1733096764000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73027-6_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,26]]},"ISBN":["9783031730269","9783031730276"],"references-count":65,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73027-6_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,26]]},"assertion":[{"value":"26 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
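
The record above is a standard Crossref REST API work object, so it can be re-fetched and parsed directly from https://api.crossref.org/works/{DOI}. Below is a minimal Python sketch of doing so; it assumes the third-party requests package is installed, and the field names it reads are taken from the record itself.

import requests

# Fetch the same Crossref work record shown above. The endpoint shape
# follows the public Crossref REST API; the "message" object in the
# response mirrors the JSON record reproduced in this document.
DOI = "10.1007/978-3-031-73027-6_17"
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]

print(work["title"][0])                          # chapter title
print(", ".join(f'{a["given"]} {a["family"]}'    # author list
                for a in work.get("author", [])))
print(work["container-title"])                   # series and volume titles
print(work["references-count"], "references")

Running this should print the chapter title, the twelve authors, the LNCS / ECCV 2024 container titles, and the reference count of 65, matching the fields above.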