{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T21:28:05Z","timestamp":1743110885420,"version":"3.40.3"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031730382"},{"type":"electronic","value":"9783031730399"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73039-9_27","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"474-490","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["IG Captioner: Information Gain Captioners Are Strong Zero-Shot Classifiers"],"prefix":"10.1007","author":[{"given":"Chenglin","family":"Yang","sequence":"first","affiliation":[]},{"given":"Siyuan","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Alan","family":"Yuille","sequence":"additional","affiliation":[]},{"given":"Jiahui","family":"Yu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"27_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"27_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"27_CR3","unstructured":"Bai, J., et al.: Qwen-VL: A versatile vision-language model for understanding, localization, text reading, and beyond (2023)"},{"key":"27_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"27_CR5","unstructured":"Chen, X., et al.: Microsoft COCO captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"issue":"1","key":"27_CR6","first-page":"22","volume":"16","author":"K Church","year":"1990","unstructured":"Church, K., Hanks, P.: Word association norms, mutual information, and lexicography. Comput. Linguist. 16(1), 22\u201329 (1990)","journal-title":"Comput. Linguist."},{"key":"27_CR7","unstructured":"Clark, K., Jaini, P.: Text-to-image diffusion models are zero-shot classifiers. 
arXiv preprint arXiv:2303.15233 (2023)"},{"key":"27_CR8","unstructured":"Dai, W., et al.: InstructBLIP: Towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"27_CR10","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"27_CR11","doi-asserted-by":"publisher","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., Taigman, Y.: Make-a-scene: scene-based text-to-image generation with human priors. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022. ECCV 2022. LNCS, vol. 13675, pp. 89\u2013106. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19784-0_6","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"27_CR12","doi-asserted-by":"crossref","unstructured":"Ge, Y., Zeng, X., Huffman, J.S., Lin, T.Y., Liu, M.Y., Cui, Y.: Visual fact checker: enabling high-fidelity detailed caption generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14033\u201314042 (2024)","DOI":"10.1109\/CVPR52733.2024.01331"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"te\u00a0Grotenhuis, M., Matthijssen, A.: Basic SPSS tutorial. Sage Publications (2015)","DOI":"10.4135\/9781483397634"},{"key":"27_CR14","doi-asserted-by":"crossref","unstructured":"He, H., Peng, N., Liang, P.: Pun generation with surprise. arXiv preprint arXiv:1904.06828 (2019)","DOI":"10.18653\/v1\/N19-1172"},{"key":"27_CR15","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1016\/S0079-6123(06)65034-6","volume":"165","author":"GE Hinton","year":"2007","unstructured":"Hinton, G.E.: To recognize shapes, first learn to generate images. Prog. Brain Res. 165, 535\u2013547 (2007)","journal-title":"Prog. Brain Res."},{"key":"27_CR16","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"27_CR17","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"27_CR18","unstructured":"Huang, S., et al.: Language is not all you need: Aligning perception with language models. arXiv:abs\/2302.14045 (2023)"},{"key":"27_CR19","doi-asserted-by":"crossref","unstructured":"Li, A.C., Prabhudesai, M., Duggal, S., Brown, E., Pathak, D.: Your diffusion model is secretly a zero-shot classifier. arXiv preprint arXiv:2303.16203 (2023)","DOI":"10.1109\/ICCV51070.2023.00210"},{"key":"27_CR20","unstructured":"Li, B., et al.: LLaVA-NeXT: Stronger LLMs supercharge multimodal capabilities in the wild (2024). https:\/\/llava-vl.github.io\/blog\/2024-05-10-llava-next-stronger-llms\/"},{"key":"27_CR21","doi-asserted-by":"crossref","unstructured":"Li, J., Galley, M., Brockett, C., Gao, J., Dolan, B.: A diversity-promoting objective function for neural conversation models. 
arXiv preprint arXiv:1510.03055 (2015)","DOI":"10.18653\/v1\/N16-1014"},{"key":"27_CR22","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"27_CR23","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"27_CR24","doi-asserted-by":"crossref","unstructured":"Li, X.L., et al.: Contrastive decoding: Open-ended text generation as optimization. arXiv preprint arXiv:2210.15097 (2022)","DOI":"10.18653\/v1\/2023.acl-long.687"},{"key":"27_CR25","doi-asserted-by":"crossref","unstructured":"Liu, A., et al.: DExperts: Decoding-time controlled text generation with experts and anti-experts. arXiv preprint arXiv:2105.03023 (2021)","DOI":"10.18653\/v1\/2021.acl-long.522"},{"key":"27_CR26","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"27_CR27","unstructured":"Liu, H., et al.: LLaVA-NeXT: Improved reasoning, OCR, and world knowledge (2024). https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"27_CR28","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"27_CR29","unstructured":"Ng, A., Jordan, M.: On discriminative vs. generative classifiers: a comparison of logistic regression and naive Bayes. In: Advances in Neural Information Processing Systems, vol. 14 (2001)"},{"key":"27_CR30","unstructured":"Nichol, A., et al.: GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"27_CR31","unstructured":"OpenAI: GPT-4 technical report. ArXiv abs\/2303.08774 (2023), https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"27_CR32","doi-asserted-by":"crossref","unstructured":"Pearson, K.: VII. Note on regression and inheritance in the case of two parents. proc. R. Soc. London 58(347-352), 240\u2013242 (1895)","DOI":"10.1098\/rspl.1895.0041"},{"key":"27_CR33","unstructured":"Peng, Z., et al.: Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"27_CR34","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"27_CR35","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1007\/BF00116251","volume":"1","author":"JR Quinlan","year":"1986","unstructured":"Quinlan, J.R.: Induction of decision trees. Mach. Learn. 1, 81\u2013106 (1986)","journal-title":"Mach. Learn."},{"key":"27_CR36","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. 
PMLR (2021)"},{"key":"27_CR37","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents (2022). https:\/\/arxiv.org\/abs\/2204.061257 (2022)"},{"key":"27_CR38","doi-asserted-by":"crossref","unstructured":"Ranzato, M., Susskind, J., Mnih, V., Hinton, G.: On deep generative models with applications to recognition. In: CVPR 2011, pp. 2857\u20132864. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995710"},{"key":"27_CR39","unstructured":"Reid, M., et\u00a0al.: Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"27_CR41","doi-asserted-by":"crossref","unstructured":"Saharia, C., et al.: Palette: image-to-image diffusion models. In: ACM SIGGRAPH 2022 Conference Proceedings, pp. 1\u201310 (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"27_CR42","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"27_CR43","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"27_CR44","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"27_CR45","first-page":"21548","volume":"35","author":"Y Su","year":"2022","unstructured":"Su, Y., Lan, T., Wang, Y., Yogatama, D., Kong, L., Collier, N.: A contrastive framework for neural text generation. Adv. Neural. Inf. Process. Syst. 35, 21548\u201321561 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"27_CR46","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"27_CR47","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"27_CR48","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"27_CR49","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"27_CR50","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789 (2022)"},{"key":"27_CR51","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: Enhancing vision-language understanding with advanced large language models. 
arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73039-9_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:27:26Z","timestamp":1730302046000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73039-9_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730382","9783031730399"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73039-9_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
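
The record above is a Crossref REST API "work" message keyed by the chapter's DOI. As a minimal sketch, assuming network access to api.crossref.org (the public Crossref endpoint that produces records in exactly this envelope), the same metadata can be re-fetched and a few fields read with nothing beyond the Python standard library; the DOI is taken from the record itself.

```python
# Minimal sketch: re-fetch this Crossref work record by DOI and print a few
# fields. Assumes network access; the /works/{doi} endpoint and the
# {"status": ..., "message": {...}} envelope follow the public Crossref REST API.
import json
import urllib.request

DOI = "10.1007/978-3-031-73039-9_27"  # DOI from the record above
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    work = json.load(resp)["message"]  # unwrap the "message" envelope

print(work["title"][0])                        # chapter title (Crossref titles are lists)
print(work["DOI"], "-", work["type"])          # DOI and record type ("book-chapter")
print(len(work.get("reference", [])), "references")
for author in work["author"]:                  # ordered author list
    print(author["given"], author["family"])
```

Run as-is, this should report the chapter title, the DOI with type book-chapter, the 51 references, and the seven authors listed in the record.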