{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:12:24Z","timestamp":1777889544708,"version":"3.51.4"},"reference-count":74,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01600","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"17224-17234","source":"Crossref","is-referenced-by-count":0,"title":["Calibrating MLLM-as-a-judge via Multimodal Bayesian Prompt Ensembles"],"prefix":"10.1109","author":[{"given":"Eric","family":"Slyman","sequence":"first","affiliation":[{"name":"Adobe Systems"}]},{"given":"Mehrab","family":"Tanjim","sequence":"additional","affiliation":[{"name":"Adobe Systems"}]},{"given":"Kushal","family":"Kafle","sequence":"additional","affiliation":[{"name":"Adobe Systems"}]},{"given":"Stefan","family":"Lee","sequence":"additional","affiliation":[{"name":"Oregon State University"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref3","article-title":"Openflamingo: An open-source framework for training large autoregressive vision-language models","author":"Awadalla","year":"2023","journal-title":"ICLR"},{"key":"ref4","article-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023","journal-title":"ICLR"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1214\/aos\/1013699998"},{"key":"ref6","article-title":"Improving image generation with better captions","author":"Betker","year":"2023","journal-title":"ICLR"},{"key":"ref7","article-title":"Visit-bench: A benchmark for visionlanguage instruction following inspired by real-world use","author":"Bitton","year":"2024","journal-title":"NeurIPS, Datasets and Benchmarks"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/S0031-3203(96)00142-2"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1175\/1520-0493(1950)078<0001:VOFEIT>2.0.CO;2"},{"key":"ref10","article-title":"X-iqe: explainable image quality evaluation for text-to-image generation with visual large language models","author":"Chen","year":"2023","journal-title":"ICLR"},{"key":"ref11","article-title":"Mj-bench: Is your multimodal reward model really a good judge for text-toimage generation?","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref12","article-title":"Dall-eval: Probing the reasoning skills and social biases of text-toimage generation models","author":"Cho","year":"2023","journal-title":"ICLR"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1177\/001316446002000104"},{"key":"ref14","article-title":"Ex-ploring GPT-4 vision for text-to-image synthesis evaluation","volume-title":"ICLR, Tiny Papers","author":"Cui","year":"2024"},{"key":"ref15","article-title":"Instructblip: Towards generalpurpose vision-language models with instruction tuning","author":"Dai","year":"2024","journal-title":"ICLR"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02303"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.121"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143874"},{"key":"ref19","article-title":"Mllm-bench, evaluating multimodal llms using gpt-4v","author":"Ge","year":"2023","journal-title":"ICLR"},{"key":"ref20","article-title":"Selective classification for deep neural networks","volume":"30","author":"Geifman","year":"2017","journal-title":"ICLR"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.5555\/2986459.2986721"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/j.xinn.2025.101253"},{"key":"ref23","first-page":"1321","article-title":"On calibration of modern neural networks","volume-title":"Proceedings of the 34th International Conference on Machine Learning (ICML)","author":"Guo"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-21606-5"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.674"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-024-06534-x"},{"key":"ref27","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","volume":"30","author":"Heusel","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref28","first-page":"13309","article-title":"Promptboosting: Black-box text classification with ten forward passes","volume-title":"International Conference on Machine Learning","author":"Hou"},{"key":"ref29","article-title":"Are 11 m-based evaluators confusing nlg quality criteria?","author":"Hu","year":"2024","journal-title":"arXiv"},{"key":"ref30","article-title":"Gpt-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv preprint"},{"key":"ref31","article-title":"Calibrating language models via augmented prompt ensembles","volume-title":"ICML Workshop on Deployable Generative AI","author":"Jiang","year":"2023"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00324"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/tbdata.2019.2921572"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1594"},{"key":"ref35","article-title":"Benchmarking cognitive biases in large language models as evaluators","author":"Koo","year":"2023","journal-title":"ICLR"},{"key":"ref36","article-title":"Improved precision and recall metric for assessing generative models","volume":"32","author":"Kynk\u00e4\u00e4nniemi","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"ref38","article-title":"From generation to judgment: Opportunities and challenges of 11 m-as-a-judge","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref39","article-title":"Llms-as-judges: a comprehensive survey on 11 m-based evaluation methods","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref40","article-title":"ROUGE: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text Summarization Branches Out"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref42","article-title":"Mitigating hallucination in large multi-modal models via robust instruction tuning","author":"Liu","year":"2024","journal-title":"ICLR"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref44","article-title":"Llms as narcissistic evaluators: When ego inflates evaluation scores","author":"Liu","year":"2023","journal-title":"ICLR"},{"key":"ref45","article-title":"Predict responsibly: improving fairness and accuracy by learning to defer","volume":"31","author":"Madras","year":"2018","journal-title":"ICLR"},{"key":"ref46","volume-title":"Gpt-4v(ision) technical work and authors","year":"2023"},{"key":"ref47","volume-title":"Openai o1 system card","year":"2024"},{"key":"ref48","article-title":"Llm evaluators recognize and favor their own generations","author":"Panickssery","year":"2024","journal-title":"ICLR"},{"key":"ref49","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002","journal-title":"ICLR"},{"key":"ref50","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023","journal-title":"ICLR"},{"key":"ref51","first-page":"1","article-title":"Sdxl: improving latent diffusion models for high-resolution image synthesis","volume-title":"International Conference on Learning Representations","author":"Podell"},{"key":"ref52","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICLR"},{"key":"ref53","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2643"},{"key":"ref56","article-title":"Verbosity bias in preference labeling by large language models","author":"Saito","year":"2023","journal-title":"arXiv preprint"},{"key":"ref57","article-title":"Improved techniques for training gans","volume":"29","author":"Salimans","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref58","article-title":"Gemini: a family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv preprint"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.728"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.511"},{"key":"ref61","article-title":"A prompt pattern catalog to enhance prompt engineering with chatgpt","author":"White","year":"2023","journal-title":"arXiv preprint"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_9"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.trustnlp-1.28"},{"key":"ref64","article-title":"Gpt4 v (ision) is a human-aligned evaluator for text-to-3d generation","author":"Wu","year":"2024","journal-title":"ICLR"},{"key":"ref65","article-title":"Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis","volume-title":"arXiv preprint","author":"Wu","year":"2023"},{"key":"ref66","article-title":"Llavacritic: Learning to evaluate multimodal models","author":"Xiong","year":"2024","journal-title":"arXiv preprint"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73030-6_3"},{"key":"ref68","article-title":"Justice or prejudice? quantifying biases in 11 m-as-a-judge","author":"Ye","year":"2024","journal-title":"arXiv preprint"},{"key":"ref69","article-title":"Lamm: Language-assisted multimodal instruction-tuning dataset, framework, and benchmark","author":"Yin","year":"2023","journal-title":"NeurIPS, Datasets and Benchmarks"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref71","article-title":"Llama-adapter: Efficient fine-tuning of language models with zero-init attention","author":"Zhang","year":"2024","journal-title":"ICLR"},{"key":"ref72","article-title":"Gpt-4v (ision) as a generalist evaluator for vision-language tasks","author":"Zhang","year":"2023","journal-title":"ICLR"},{"key":"ref73","article-title":"Judging 11m-as-a-judge with mt-bench and chatbot arena","author":"Zheng","year":"2023","journal-title":"ICLR"},{"key":"ref74","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2024","journal-title":"ICLR"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11443617.pdf?arnumber=11443617","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:16:27Z","timestamp":1777612587000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11443617\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":74,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01600","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}