{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T04:39:12Z","timestamp":1764995952578,"version":"3.46.0"},"reference-count":96,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T00:00:00Z","timestamp":1757894400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T00:00:00Z","timestamp":1757894400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206009","62022009"],"award-info":[{"award-number":["62206009","62022009"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key R D Program of China","doi-asserted-by":"crossref","award":["2022ZD0116310"],"award-info":[{"award-number":["2022ZD0116310"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s11263-025-02556-7","type":"journal-article","created":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T12:11:34Z","timestamp":1757938294000},"page":"8332-8355","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["GenderBias-VL: Benchmarking Gender Bias in Vision Language Models via Counterfactual 
Probing"],"prefix":"10.1007","volume":"133","author":[{"given":"Yisong","family":"Xiao","sequence":"first","affiliation":[]},{"given":"Xianglong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"QianJia","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Zhenfei","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Siyuan","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Jiapeng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Aishan","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,15]]},"reference":[{"key":"2556_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.\u00a0L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S.(2023). et\u00a0al. Gpt-4 technical report. arXiv:2303.08774."},{"key":"2556_CR2","unstructured":"Agarwal, S., Krueger, G., Clark, J., Radford, A., Kim, J.\u00a0W., & Brundage, M. (2021). Evaluating clip: towards characterization of broader capabilities and downstream implications. arXiv:2108.02818."},{"key":"2556_CR3","unstructured":"Andrew. Chatgpt: How to generate prompts for stable diffusion. https:\/\/stable-diffusion-art.com\/chatgpt-prompt\/, (2023)."},{"key":"2556_CR4","unstructured":"Andrew. Stable diffusion prompt: a definitive guide. https:\/\/stable-diffusion-art.com\/prompt-guide\/, (2024)."},{"key":"2556_CR5","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., & Zhou, J. (2023). Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond."},{"key":"2556_CR6","doi-asserted-by":"crossref","unstructured":"Barocas, S., & Selbst, A.\u00a0D.(2016). Big data\u2019s disparate impact. Calif. L. 
Rev., 104:671.","DOI":"10.2139\/ssrn.2477899"},{"key":"2556_CR7","unstructured":"Bhargava, S., & Forsyth, D. (2019). Exposing and correcting the gender bias in image captioning datasets and models. arXiv:1912.00578."},{"key":"2556_CR8","doi-asserted-by":"crossref","unstructured":"Bianchi, F., Kalluri, P., Durmus, E., Ladhak, F., Cheng, M., Nozza, D., Hashimoto, T., Jurafsky, D., Zou, J., & Caliskan, A. (2023). Easily accessible text-to-image generation amplifies demographic stereotypes at large scale. In FAccT.","DOI":"10.1145\/3593013.3594095"},{"key":"2556_CR9","unstructured":"Birhane, A., Prabhu, V.\u00a0U., & Kahembwe, E.(2021). Multimodal datasets: misogyny, pornography, and malignant stereotypes. arXiv:2110.01963."},{"key":"2556_CR10","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., & Efros, A.\u00a0A. (2023). Instructpix2pix: Learning to follow image editing instructions. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"2556_CR11","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.\u00a0D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., & Askell, A. et\u00a0al.(2020). Language models are few-shot learners. NIPS."},{"issue":"6334","key":"2556_CR12","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1126\/science.aal4230","volume":"356","author":"A Caliskan","year":"2017","unstructured":"Caliskan, A., Bryson, J. J., & Narayanan, A. (2017). Semantics derived automatically from language corpora contain human-like biases. Science, 356(6334), 183\u2013186.","journal-title":"Science"},{"key":"2556_CR13","doi-asserted-by":"crossref","unstructured":"Carothers, B.\u00a0J., & Reis, H.\u00a0T.(2013). Men and women are from earth: examining the latent structure of gender. Journal of personality and social psychology, 104(2):385.","DOI":"10.1037\/a0030437"},{"key":"2556_CR14","unstructured":"Chen, J., Zhu, D., Shen, X., Li, X., Liu, Z., Zhang, P., Krishnamoorthi, R., Chandra, V., Xiong, Y., & Elhoseiny, M. 
(2023). Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478."},{"key":"2556_CR15","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., & Zhao, R. (2023). Shikra: Unleashing multimodal llm\u2019s referential dialogue magic. arXiv:2306.15195."},{"key":"2556_CR16","doi-asserted-by":"crossref","unstructured":"Cherti, M., Beaumont, R., Wightman, R., Wortsman, M., Ilharco, G., Gordon, C., Schuhmann, C., Schmidt, L., & Jitsev, J. (2023). Reproducible scaling laws for contrastive language-image learning. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"2556_CR17","unstructured":"Chiang, W.-L., Li, Z., Lin, Z., Sheng, Y., Wu, Z., Zhang, H., Zheng, L., Zhuang, S., Zhuang, Y., Gonzalez, J.\u00a0E. et\u00a0al. (2023). Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality, march 2023. URL https:\/\/lmsys.org\/blog\/2023-03-30-vicuna, 3(5)."},{"key":"2556_CR18","unstructured":"CompVis. stable-diffusion-safety-checker. https:\/\/huggingface.co\/CompVis\/stable-diffusion-safety-checker, (2022)."},{"key":"2556_CR19","unstructured":"Cortes, P., & Pan, J. (2019). Gender, occupational segregation, and automation. Economics Studies at Brookings, pages 1\u201332."},{"key":"2556_CR20","unstructured":"Dai, W., Li, J., Li, D., Tiong, A.\u00a0M.\u00a0H., Zhao, J., Wang, W., Li, B., Fung, P.\u00a0N., & Hoi, S. (2024). Instructblip: Towards general-purpose vision-language models with instruction tuning. NIPS."},{"key":"2556_CR21","doi-asserted-by":"crossref","unstructured":"De-Arteaga, M., Romanov, A., Wallach, H., Chayes, J., Borgs, C., Chouldechova, A., Geyik, S., Kenthapadi, K., & Kalai, A.\u00a0T. (2019). Bias in bios: A case study of semantic representation bias in a high-stakes setting. In FAccT.","DOI":"10.1145\/3287560.3287572"},{"key":"2556_CR22","doi-asserted-by":"crossref","unstructured":"Dev, S., Monajatipoor, M., Ovalle, A., Subramonian, A., Phillips, J.\u00a0M., & Chang, K.-W. 
(2021). Harms of gender exclusivity and challenges in non-binary representation in language technologies. arXiv:2108.12084.","DOI":"10.18653\/v1\/2021.emnlp-main.150"},{"key":"2556_CR23","unstructured":"Dong, X., Zhang, P., Zang, Y., Cao, Y., Wang, B., Ouyang, L., Wei, X., Zhang, S., Duan, H., Cao, M., et\u00a0al. (2024). Internlm-xcomposer2: Mastering free-form text-image composition and comprehension in vision-language large model. arXiv:2401.16420."},{"key":"2556_CR24","doi-asserted-by":"crossref","unstructured":"Dwork, C., Hardt, M., Pitassi, T., Reingold, O., & Zemel, R.(2012). Fairness through awareness. In Proceedings of the 3rd innovations in theoretical computer science conference.","DOI":"10.1145\/2090236.2090255"},{"key":"2556_CR25","doi-asserted-by":"publisher","first-page":"458","DOI":"10.4135\/9781446249222.n49","volume":"2","author":"AH Eagly","year":"2012","unstructured":"Eagly, A. H., & Wood, W. (2012). Social role theory. Handbook of theories of social psychology, 2, 458\u2013476.","journal-title":"Handbook of theories of social psychology"},{"key":"2556_CR26","doi-asserted-by":"crossref","unstructured":"Feldman, M.,Friedler,S.\u00a0A., Moeller,J., Scheidegger, C., & Venkatasubramanian, S. (2015). Certifying and removing disparate impact. In SIGKDD.","DOI":"10.1145\/2783258.2783311"},{"key":"2556_CR27","doi-asserted-by":"crossref","unstructured":"Fraser, K.\u00a0C., & Kiritchenko, S. (2024). Examining gender and racial bias in large vision-language models using a novel dataset of parallel images. arXiv:2402.05779.","DOI":"10.18653\/v1\/2024.eacl-long.41"},{"key":"2556_CR28","doi-asserted-by":"crossref","unstructured":"Galhotra, S., Brun, Y., & Meliou, A. (2017). Fairness testing: testing software for discrimination. 
In Proceedings of the 2017 11th Joint meeting on foundations of software engineering, pages 498\u2013510.","DOI":"10.1145\/3106237.3106277"},{"key":"2556_CR29","unstructured":"Gao, P., Han, J., Zhang, R., Lin, Z., Geng, S., Zhou, A., Zhang, W., Lu, P., He, C., Yue, X. et\u00a0al. (2023). Llama-adapter v2: Parameter-efficient visual instruction model. arXiv:2304.15010."},{"key":"2556_CR30","doi-asserted-by":"crossref","unstructured":"Ghosh, S., & Caliskan, A. (2023). Chatgpt perpetuates gender bias in machine translation and ignores non-gendered pronouns: Findings across bengali and five other low-resource languages. In AAAI\/ACM Conference on AI, Ethics, and Society.","DOI":"10.1145\/3600211.3604672"},{"key":"2556_CR31","unstructured":"Hall, S.\u00a0M., Gon\u00e7alves\u00a0Abrantes, F., Zhu, H., Sodunke, G., Shtedritski, A., & Kirk, H.\u00a0R. (2024). Visogender: A dataset for benchmarking gender bias in image-text pronoun resolution. NIPS."},{"key":"2556_CR32","unstructured":"Hardt, M., Price, E., & Srebro, N. (2016). Equality of opportunity in supervised learning. NIPS."},{"key":"2556_CR33","doi-asserted-by":"crossref","unstructured":"Hendricks, L.\u00a0A., Burns, K., Saenko, K., Darrell, T., & Rohrbach, A. (2018). Women also snowboard: Overcoming bias in captioning models. In ECCV.","DOI":"10.1007\/978-3-030-01219-9_47"},{"key":"2556_CR34","doi-asserted-by":"crossref","unstructured":"Howard, P., Bhiwandiwalla, A., Fraser, K.\u00a0C., & Kiritchenko, S. (2024). Uncovering bias in large vision-language models with counterfactuals. arXiv:2404.00166.","DOI":"10.18653\/v1\/2025.naacl-long.305"},{"key":"2556_CR35","doi-asserted-by":"crossref","unstructured":"Howard, P., Madasu, A., Le, T., Moreno, G.\u00a0L., Bhiwandiwalla, A., & Lal, V. (2023). Probing and mitigating intersectional social biases in vision-language models with counterfactual examples. 
arXiv:2312.00825.","DOI":"10.1109\/CVPR52733.2024.01138"},{"key":"2556_CR36","unstructured":"Ilharco, G., Wortsman, M., Wightman, R., Gordon, C., Carlini, N., Taori, R., Dave, A., Shankar, V., Namkoong, H., Miller, J., Hajishirzi, H., Farhadi, A., & Schmidt, L. (2021). Openclip. If you use this software, please cite it as below."},{"key":"2556_CR37","doi-asserted-by":"crossref","unstructured":"Iskander, S., Radinsky, K., & Belinkov, Y. (2023). Shielded representations: Protecting sensitive attributes through iterative gradient-based projection. In Findings of ACL.","DOI":"10.18653\/v1\/2023.findings-acl.369"},{"key":"2556_CR38","doi-asserted-by":"crossref","unstructured":"Janghorbani, S., & De\u00a0Melo, G. (2023). Multimodal bias: Introducing a framework for stereotypical bias assessment beyond gender and race in vision language models. arXiv:2303.12734.","DOI":"10.18653\/v1\/2023.eacl-main.126"},{"key":"2556_CR39","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi,N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.\u00a0C., Lo, W.-Y. et\u00a0al. (2023). Segment anything. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2556_CR40","unstructured":"Kusner, M.\u00a0J., Loftus, J., Russell, C., & Silva, R. (2017). Counterfactual fairness. NIPS."},{"key":"2556_CR41","unstructured":"Lauren\u00e7on, H., Marafioti, A., Sanh, V., & Tronchon, L. (2024). Building and better understanding vision-language models: insights and future directions. In Workshop on Responsibly Building the Next Generation of Multimodal Foundational Models."},{"key":"2556_CR42","unstructured":"Lee, N., Bang, Y., Lovenia, H., Cahyawijaya, S., Dai, W., & Fung, P. (2023). Survey of social bias in vision-language models. arXiv:2309.14381."},{"key":"2556_CR43","doi-asserted-by":"crossref","unstructured":"Li, B., Ge, Y., Ge, Y., Wang, G., Wang, R., Zhang, R., & Shan, Y. (2024). Seed-bench: Benchmarking multimodal large language models. 
In CVPR.","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"2556_CR44","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Pu, F., Yang, J., Li, C., & Liu, Z. (2023). Mimic-it: Multi-modal in-context instruction tuning. arXiv:2306.05425."},{"key":"2556_CR45","unstructured":"Li, B., Zhang, Y., Guo, D., Zhang, R., Li, F., Zhang, H., Zhang, K., Zhang, P., Li, Y., Liu, Z. et\u00a0al. (2024). Llava-onevision: Easy visual task transfer. arXiv:2408.03326."},{"key":"2556_CR46","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S.(2023). Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML."},{"key":"2556_CR47","doi-asserted-by":"crossref","unstructured":"Liang, P.\u00a0P., Li, I.\u00a0M., Zheng, E., Lim, Y.\u00a0C., Salakhutdinov, R., & Morency, L.-P. (2020). Towards debiasing sentence representations. In ACL.","DOI":"10.18653\/v1\/2020.acl-main.488"},{"key":"2556_CR48","doi-asserted-by":"crossref","unstructured":"Limisiewicz, T. & Mare\u010dek, D. (2022). Don\u2019t forget about pronouns: Removing gender bias in language models without losing factual gender information. In Proceedings of the 4th Workshop on Gender Bias in Natural Language Processing (GeBNLP), pages 17\u201329.","DOI":"10.18653\/v1\/2022.gebnlp-1.3"},{"key":"2556_CR49","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., & Lee, Y.\u00a0J.(2023). Improved baselines with visual instruction tuning. arXiv:2310.03744.","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2556_CR50","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.\u00a0J. (2024). Visual instruction tuning. NIPS."},{"key":"2556_CR51","doi-asserted-by":"crossref","unstructured":"Liu, S., Zeng, Z., Ren, T., Li, F., Zhang, H., Yang, J., Li, C., Yang, J., Su, H., Zhu, J. et\u00a0al. (2023). Grounding dino: Marrying dino with grounded pre-training for open-set object detection. 
arXiv:2303.05499.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"2556_CR52","doi-asserted-by":"crossref","unstructured":"Liu, Y., Duan, H., Zhang, Y., Li, B., Zhang, S., Zhao, W., Yuan, Y., Wang, J., He, C., Liu, Z. et\u00a0al.(2023). Mmbench: Is your multi-modal model an all-around player? arXiv:2307.06281.","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"2556_CR53","unstructured":"Luccioni, S., Akiki, C., Mitchell, M., Jernite, Y. (2024). Stable bias: Evaluating societal representations in diffusion models. NIPS."},{"key":"2556_CR54","doi-asserted-by":"crossref","unstructured":"M.\u00a0Mitchell, D.\u00a0Baker, N.\u00a0Moorosi, E.\u00a0Denton, B.\u00a0Hutchinson, A.\u00a0Hanna, T.\u00a0Gebru, and J.\u00a0Morgenstern. (2020). Diversity and inclusion metrics in subset selection. In Proceedings of the AAAI\/ACM Conference on AI, Ethics, and Society.","DOI":"10.1145\/3375627.3375832"},{"key":"2556_CR55","unstructured":"U.S. Bureau of Labor Statistics. Labor force statistics from the current population survey. https:\/\/www.bls.gov\/cps\/cpsaat11.htm, (2024)."},{"key":"2556_CR56","unstructured":"OpenAI. Hello gpt-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/, (2024)."},{"key":"2556_CR57","unstructured":"L.\u00a0Ouyang, J.\u00a0Wu, X.\u00a0Jiang, D.\u00a0Almeida, C.\u00a0Wainwright, P.\u00a0Mishkin, C.\u00a0Zhang, S.\u00a0Agarwal, K.\u00a0Slama, A.\u00a0Ray, et\u00a0al. (2022). Training language models to follow instructions with human feedback. NIPS."},{"key":"2556_CR58","unstructured":"Z.\u00a0Peng, W.\u00a0Wang, L.\u00a0Dong, Y.\u00a0Hao, S.\u00a0Huang, S.\u00a0Ma, and F.\u00a0Wei. (2023). Kosmos-2: Grounding multimodal large language models to the world. arXiv:2306.14824."},{"key":"2556_CR59","doi-asserted-by":"crossref","unstructured":"Pezeshkpour, P., & Hruschka, E. (2023). Large language models sensitivity to the order of options in multiple-choice questions. 
arXiv:2308.11483.","DOI":"10.18653\/v1\/2024.findings-naacl.130"},{"key":"2556_CR60","doi-asserted-by":"crossref","unstructured":"Pezeshkpour, P., & Hruschka, E.(2024). Large language models sensitivity to the order of options in multiple-choice questions. In Findings of NAACL.","DOI":"10.18653\/v1\/2024.findings-naacl.130"},{"key":"2556_CR61","unstructured":"D.\u00a0Podell, Z.\u00a0English, K.\u00a0Lacey, A.\u00a0Blattmann, T.\u00a0Dockhorn, J.\u00a0M\u00fcller, J.\u00a0Penna, and R.\u00a0Rombach. (2023). Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv:2307.01952."},{"key":"2556_CR62","doi-asserted-by":"crossref","unstructured":"H.\u00a0Qiu, Z.-Y. Dou, T.\u00a0Wang, A.\u00a0Celikyilmaz, and N.\u00a0Peng. (2023). Gender biases in automatic evaluation metrics for image captioning. In EMNLP.","DOI":"10.18653\/v1\/2023.emnlp-main.520"},{"key":"2556_CR63","unstructured":"A.\u00a0Radford, J.\u00a0W. Kim, C.\u00a0Hallacy, A.\u00a0Ramesh, G.\u00a0Goh, S.\u00a0Agarwal, G.\u00a0Sastry, A.\u00a0Askell, P.\u00a0Mishkin, J.\u00a0Clark, et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In ICML."},{"key":"2556_CR64","unstructured":"N.\u00a0Ratzlaff, M.\u00a0L. Olson, M.\u00a0Hinck, S.-Y. Tseng, V.\u00a0Lal, and P.\u00a0Howard. Debiasing large vision-language models by ablating protected attribute representations. In Neurips Safe Generative AI Workshop 2024."},{"key":"2556_CR65","doi-asserted-by":"crossref","unstructured":"S.\u00a0Ravfogel, Y.\u00a0Elazar, H.\u00a0Gonen, M.\u00a0Twiton, and Y.\u00a0Goldberg.(2020). Null it out: Guarding protected attributes by iterative nullspace projection. In ACL.","DOI":"10.18653\/v1\/2020.acl-main.647"},{"key":"2556_CR66","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., et al. (2022). 
Laion-5b: An open large-scale dataset for training next generation image-text models. NIPS, 35, 25278\u201325294.","journal-title":"NIPS"},{"key":"2556_CR67","unstructured":"H.\u00a0Shi, S.\u00a0D. Dao, & J.\u00a0Cai. Llmformer: Large language model for open-vocabulary semantic segmentation. IJCV, pages 1\u201318, (2024)."},{"key":"2556_CR68","unstructured":"Z.\u00a0Shi, Z.\u00a0Wang, H.\u00a0Fan, Z.\u00a0Yin, L.\u00a0Sheng, Y.\u00a0Qiao, & J.\u00a0Shao. Chef: A comprehensive evaluation framework for standardized assessment of multimodal large language models. arXiv:2311.02692, (2023)."},{"key":"2556_CR69","doi-asserted-by":"crossref","unstructured":"V.\u00a0K. Singh, M.\u00a0Chayko, R.\u00a0Inamdar, and D.\u00a0Floegel. (2020). Female librarians and male computer programmers? gender bias in occupational images on digital media platforms. Journal of the Association for Information Science and Technology.","DOI":"10.1002\/asi.24335"},{"key":"2556_CR70","unstructured":"Q.\u00a0Sun, Y.\u00a0Fang, L.\u00a0Wu, X.\u00a0Wang, and Y.\u00a0Cao. Eva-clip: Improved training techniques for clip at scale. arXiv:2303.15389, (2023)."},{"key":"2556_CR71","doi-asserted-by":"crossref","unstructured":"Z.\u00a0Sun, S.\u00a0Shen, S.\u00a0Cao, H.\u00a0Liu, C.\u00a0Li, Y.\u00a0Shen, C.\u00a0Gan, L.-Y. Gui, Y.-X. Wang, Y.\u00a0Yang, et\u00a0al. (2023). Aligning large multimodal models with factually augmented rlhf. arXiv:2309.14525.","DOI":"10.18653\/v1\/2024.findings-acl.775"},{"key":"2556_CR72","unstructured":"G.\u00a0Team, R.\u00a0Anil, S.\u00a0Borgeaud, Y.\u00a0Wu, J.-B. Alayrac, J.\u00a0Yu, R.\u00a0Soricut, J.\u00a0Schalkwyk, A.\u00a0M. Dai, A.\u00a0Hauth, et\u00a0al. Gemini: a family of highly capable multimodal models. arXiv:2312.11805, (2023)."},{"key":"2556_CR73","unstructured":"G.\u00a0Team, P.\u00a0Georgiev, V.\u00a0I. Lei, R.\u00a0Burnell, L.\u00a0Bai, A.\u00a0Gulati, G.\u00a0Tanzer, D.\u00a0Vincent, Z.\u00a0Pan, S.\u00a0Wang, et\u00a0al. 
Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv:2403.05530, (2024)."},{"key":"2556_CR74","unstructured":"G.-V. Team. Genderbias-vl. https:\/\/genderbiasvl.github.io\/, (2024)."},{"key":"2556_CR75","unstructured":"I.\u00a0Team. Internlm: A multilingual language model with progressively enhanced capabilities, (2023)."},{"key":"2556_CR76","unstructured":"O.\u00a0Team. Internvl2: Better than the best\u2014expanding performance boundaries of open-source multimodal models with the progressive scaling strategy, (2024)."},{"key":"2556_CR77","doi-asserted-by":"crossref","unstructured":"S.\u00a0Tong, Z.\u00a0Liu, Y.\u00a0Zhai, Y.\u00a0Ma, Y.\u00a0LeCun, & S.\u00a0Xie. Eyes wide shut? exploring the visual shortcomings of multimodal llms. In CVPR, (2024).","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"2556_CR78","unstructured":"Touvron, H.,Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F. et\u00a0al. (2023). Llama: Open and efficient foundation language models. arXiv:2302.13971."},{"key":"2556_CR79","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S. et\u00a0al. (2023). Llama 2: Open foundation and fine-tuned chat models. arXiv:2307.09288."},{"key":"2556_CR80","doi-asserted-by":"crossref","unstructured":"Van Der\u00a0Goot, R., Ljube\u0161i\u0107, N., Matroos, I., Nissim, M., & Plank, B. (2018). Bleaching text: Abstract features for cross-lingual gender prediction. arXiv:1805.03122.","DOI":"10.18653\/v1\/P18-2061"},{"key":"2556_CR81","unstructured":"Wan, Y., & Chang, K.-W. (2024). The male ceo and the female assistant: Probing gender biases in text-to-image models through paired stereotype test. arXiv:2402.11089."},{"key":"2556_CR82","unstructured":"Wang, P., Bai, S., Tan, S., Wang, S., Fan, Z., Bai, J., Chen, K., Liu, X., Wang, J., Ge, W. et\u00a0al. (2024). 
Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution. arXiv:2409.12191."},{"key":"2556_CR83","doi-asserted-by":"crossref","unstructured":"Wolfe, R., Yang, Y., Howe, B., & Caliskan, A. (2023). Contrastive language-vision ai models pretrained on web-scraped multimodal data exhibit sexual objectification bias. In FAccT.","DOI":"10.1145\/3593013.3594072"},{"key":"2556_CR84","doi-asserted-by":"crossref","unstructured":"Xiao, Y., Liu, A., Li, T., & Liu, X. (2023). Latent imitator: Generating natural individual discriminatory instances for black-box fairness testing. In ISSTA.","DOI":"10.1145\/3597926.3598099"},{"key":"2556_CR85","doi-asserted-by":"crossref","unstructured":"Ye, Q., Xu, H., Ye, J., Yan, M., Liu, H., Qian, Q., Zhang, J., Huang, F., & Zhou, J. (2023). mplug-owl2: Revolutionizing multi-modal large language model with modality collaboration. arXiv:2311.04257.","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"2556_CR86","unstructured":"Yin, Z., Wang, J., Cao, J., Shi, Z., Liu, D., Li, M., Huang, X., Wang, Z., Sheng, L., Bai, L. et\u00a0al. (2024). Lamm: Language-assisted multi-modal instruction-tuning dataset, framework, and benchmark. NIPS."},{"key":"2556_CR87","doi-asserted-by":"crossref","unstructured":"Yu, T., Yao, Y., Zhang, H., He, T., Han, Y., Cui, G., Hu, J., Liu, Z., Zheng, H.-T., Sun, M. et\u00a0al. (2023). Rlhf-v: Towards trustworthy mllms via behavior alignment from fine-grained correctional human feedback. arXiv:2312.00849.","DOI":"10.1109\/CVPR52733.2024.01310"},{"key":"2556_CR88","doi-asserted-by":"crossref","unstructured":"Zang, Y., Li, W., Han, J., Zhou, K., & Loy, C.\u00a0C. (2024). Contextual object detection with multimodal large language models. IJCV.","DOI":"10.1007\/s11263-024-02214-4"},{"key":"2556_CR89","unstructured":"Zhang, J., Wang, S., Cao, X., Yuan, Z., Shan, S., Chen, X., & Gao, W. (2024). Vlbiasbench: A comprehensive benchmark for evaluating bias in large vision-language model. 
arXiv:2406.14194."},{"key":"2556_CR90","doi-asserted-by":"crossref","unstructured":"Zhang, L., Zhang, Y., & Zhang, M. (2021). Efficient white-box fairness testing through gradient search. In ISSTA.","DOI":"10.1145\/3460319.3464820"},{"key":"2556_CR91","unstructured":"Zhang, P., Wang, X.\u00a0D.\u00a0B., Cao, Y., Xu, C., Ouyang, L., Zhao, Z., Ding, S., Zhang, S., Duan, H., Yan, H. et\u00a0al. (2023). Internlm-xcomposer: A vision-language large model for advanced text-image comprehension and composition. arXiv:2309.15112."},{"key":"2556_CR92","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Wang, J., & Sang, J. (2022). Counterfactually measuring and eliminating social bias in vision-language pre-training models. In ACM MM.","DOI":"10.1145\/3503161.3548396"},{"key":"2556_CR93","doi-asserted-by":"crossref","unstructured":"Zhao, D., Wang, A., & Russakovsky, O. (2021). Understanding and evaluating racial biases in image captioning. In ICCV.","DOI":"10.1109\/ICCV48922.2021.01456"},{"key":"2556_CR94","doi-asserted-by":"crossref","unstructured":"Zhao, J., Wang, T., Yatskar, M., Ordonez, V., & Chang, K.-W. (2017). Men also like shopping: Reducing gender bias amplification using corpus-level constraints. arXiv:1707.09457.","DOI":"10.18653\/v1\/D17-1323"},{"key":"2556_CR95","doi-asserted-by":"crossref","unstructured":"Zheng, H., Chen, Z., Du, T., Zhang, X., Cheng, Y., Ji, S., Wang, J., Yu, Y., & Chen, J. (2022). Neuronfair: Interpretable white-box fairness testing through biased neuron identification. In ICSE.","DOI":"10.1145\/3510003.3510123"},{"key":"2556_CR96","doi-asserted-by":"crossref","unstructured":"Zhou, K., LAI, Y., & Jiang, J. (2022). Vlstereoset: A study of stereotypical bias in pre-trained vision-language models. 
ACL.","DOI":"10.18653\/v1\/2022.aacl-main.40"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02556-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02556-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02556-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T04:04:29Z","timestamp":1764993869000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02556-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,15]]},"references-count":96,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["2556"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02556-7","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,9,15]]},"assertion":[{"value":"2 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 September 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Our paper aims to probe and benchmark the occupation-related gender bias in LVLMs. While evaluation results may raise ethical concerns and potentially harm readers, our intention is not to cause harm. 
Rather, our work seeks to facilitate bias evaluation for LVLMs, serving as a crucial initial step toward mitigating discriminatory outcomes.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical statement and broader impact"}}]}}