{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T13:32:11Z","timestamp":1767706331187,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754833","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"2831-2840","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["BiMa: Towards Biases Mitigation for Text-Video Retrieval via Scene Element Guidance"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2021-9678","authenticated-orcid":false,"given":"Huy","family":"Le","sequence":"first","affiliation":[{"name":"FPT Software AI Center, Hanoi, Vietnam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2966-3492","authenticated-orcid":false,"given":"Nhat","family":"Chung","sequence":"additional","affiliation":[{"name":"FPT Software AI Center, Hanoi, Vietnam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7696-1444","authenticated-orcid":false,"given":"Tung","family":"Kieu","sequence":"additional","affiliation":[{"name":"Aalborg University, Aalborg, Denmark and Pioneer Centre for AI, Copenhagen, Denmark"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1449-211X","authenticated-orcid":false,"given":"Anh","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Liverpool, Liverpool, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2571-0511","authenticated-orcid":false,"given":"Ngan","family":"Le","sequence":"additional","affiliation":[{"name":"University of Arkansas, Fayetteville, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In IEEE\/CVF International Conference on Computer Vision (ICCV). 1708-1718","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In IEEE\/CVF International Conference on Computer Vision (ICCV). 1708-1718."},{"key":"e_1_3_2_1_2_1","first-page":"4349","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS)","author":"Bolukbasi Tolga","year":"2016","unstructured":"Tolga Bolukbasi, Kai-Wei Chang, James Y Zou, Venkatesh Saligrama, and Adam T Kalai. 2016. Man is to computer programmer as woman is to homemaker? debiasing word embeddings. Annual Conference on Neural Information Processing Systems (NeurIPS) (2016), 4349-4357."},{"volume-title":"Collecting Highly Parallel Data for Paraphrase Evaluation. In Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL). 190-200","author":"David","key":"e_1_3_2_1_3_1","unstructured":"David L. Chen and William B. Dolan. 2011. Collecting Highly Parallel Data for Paraphrase Evaluation. In Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL). 190-200."},{"key":"e_1_3_2_1_4_1","volume-title":"Fine-Grained Video-Text Retrieval With Hierarchical Graph Reasoning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10635-10644","author":"Chen Shizhe","year":"2020","unstructured":"Shizhe Chen, Yida Zhao, Qin Jin, and Qi Wu. 2020. Fine-Grained Video-Text Retrieval With Hierarchical Graph Reasoning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10635-10644."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_32"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25113"},{"key":"e_1_3_2_1_7_1","volume-title":"UATVR: Uncertainty-Adaptive Text-Video Retrieval. In IEEE\/CVF International Conference on Computer Vision (ICCV). 13677-13687","author":"Fang Bo","year":"2023","unstructured":"Bo Fang, Wenhao Wu, Chang Liu, Yu Zhou, Yuxin Song, Weiping Wang, Xiangbo Shu, Xiangyang Ji, and Jingdong Wang. 2023. UATVR: Uncertainty-Adaptive Text-Video Retrieval. In IEEE\/CVF International Conference on Computer Vision (ICCV). 13677-13687."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1720347115"},{"key":"e_1_3_2_1_9_1","volume-title":"X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4996-5005","author":"Gorti Satya Krishna","year":"2022","unstructured":"Satya Krishna Gorti, No\u00ebl Vouitsis, Junwei Ma, Keyvan Golestan, Maksims Volkovs, Animesh Garg, and Guangwei Yu. 2022. X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4996-5005."},{"key":"e_1_3_2_1_10_1","volume-title":"Large Minibatch SGD: Training ImageNet in 1 Hour. arXiv preprint arXiv:706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross B. Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour. arXiv preprint arXiv:706.02677 (2017)."},{"key":"e_1_3_2_1_11_1","volume-title":"MSCap: Multi-Style Image Captioning With Unpaired Stylized Text. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4204-4213","author":"Guo Longteng","year":"2019","unstructured":"Longteng Guo, Jing Liu, Peng Yao, Jiangwei Li, and Hanqing Lu. 2019. MSCap: Multi-Style Image Captioning With Unpaired Stylized Text. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4204-4213."},{"volume-title":"Localizing Moments in Video with Natural Language. In IEEE\/CVF International Conference on Computer Vision (ICCV). 5804-5813","author":"Hendricks Lisa Anne","key":"e_1_3_2_1_12_1","unstructured":"Lisa Anne Hendricks, Oliver Wang, Eli Shechtman, Josef Sivic, Trevor Darrell, and Bryan C. Russell. 2017. Localizing Moments in Video with Natural Language. In IEEE\/CVF International Conference on Computer Vision (ICCV). 5804-5813."},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Higgins Irina","year":"2017","unstructured":"Irina Higgins, Lo\u00efc Matthey, Arka Pal, Christopher P. Burgess, Xavier Glorot, Matthew M. Botvinick, Shakir Mohamed, and Alexander Lerchner. 2017. beta-VAE: Learning Basic Visual Concepts with a Constrained Variational Framework. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.7"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02242"},{"key":"e_1_3_2_1_16_1","volume-title":"ImageNet-X: Understanding Model Mistakes with Factor of Variation Annotations. In International Conference on Learning Representation (ICLR).","author":"Idrissi Badr Youbi","year":"2023","unstructured":"Badr Youbi Idrissi, Diane Bouchacourt, Randall Balestriero, Ivan Evtimov, Caner Hazirbas, Nicolas Ballas, Pascal Vincent, Michal Drozdzal, David Lopez-Paz, and Mark Ibrahim. 2023. ImageNet-X: Understanding Model Mistakes with Factor of Variation Annotations. In International Conference on Learning Representation (ICLR)."},{"key":"e_1_3_2_1_17_1","volume-title":"Expectation-Maximization Contrastive Learning for Compact Video-and-Language Representations. In Annual Conference on Neural Information Processing Systems (NeurIPS). 30291-30306","author":"Jin Peng","year":"2022","unstructured":"Peng Jin, Jinfa Huang, Fenglin Liu, Xian Wu, Shen Ge, Guoli Song, David A. Clifton, and Jie Chen. 2022. Expectation-Maximization Contrastive Learning for Compact Video-and-Language Representations. In Annual Conference on Neural Information Processing Systems (NeurIPS). 30291-30306."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"e_1_3_2_1_19_1","volume-title":"Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment. In International Joint Conference on Artificial Intelligence (IJCAI). 938-946","author":"Jin Peng","year":"2023","unstructured":"Peng Jin, Hao Li, Zesen Cheng, Jinfa Huang, Zhennan Wang, Li Yuan, Chang Liu, and Jie Chen. 2023b. Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment. In International Joint Conference on Artificial Intelligence (IJCAI). 938-946."},{"key":"e_1_3_2_1_20_1","volume-title":"DiffusionRet: Generative Text-Video Retrieval with Diffusion Model. In IEEE\/CVF International Conference on Computer Vision (ICCV). 2470-2481","author":"Jin Peng","year":"2023","unstructured":"Peng Jin, Hao Li, Zesen Cheng, Kehan Li, Xiangyang Ji, Chang Liu, Li Yuan, and Jie Chen. 2023c. DiffusionRet: Generative Text-Video Retrieval with Diffusion Model. In IEEE\/CVF International Conference on Computer Vision (ICCV). 2470-2481."},{"key":"e_1_3_2_1_21_1","volume-title":"Disentangled Representation Learning for Non-Parallel Text Style Transfer. In Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL). 424-434","author":"John Vineet","year":"2019","unstructured":"Vineet John, Lili Mou, Hareesh Bahuleyan, and Olga Vechtomova. 2019. Disentangled Representation Learning for Non-Parallel Text Style Transfer. In Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL). 424-434."},{"volume-title":"Adam: A Method for Stochastic Optimization. In International Conference on Learning Representation (ICLR).","author":"Diederik","key":"e_1_3_2_1_22_1","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In International Conference on Learning Representation (ICLR)."},{"volume-title":"Auto-Encoding Variational Bayes. In International Conference on Learning Representation (ICLR).","author":"Diederik","key":"e_1_3_2_1_23_1","unstructured":"Diederik P. Kingma and Max Welling. 2014. Auto-Encoding Variational Bayes. In International Conference on Learning Representation (ICLR)."},{"key":"e_1_3_2_1_24_1","volume-title":"Dense-Captioning Events in Videos. In IEEE\/CVF International Conference on Computer Vision (ICCV). 706-715","author":"Krishna Ranjay","year":"2017","unstructured":"Ranjay Krishna, Kenji Hata, Frederic Ren, Li Fei-Fei, and Juan Carlos Niebles. 2017. Dense-Captioning Events in Videos. In IEEE\/CVF International Conference on Computer Vision (ICCV). 706-715."},{"key":"e_1_3_2_1_25_1","volume-title":"WAVER: Writing-Style Agnostic Text-Video Retrieval Via Distilling Vision-Language Models Through Open-Vocabulary Knowledge","author":"Le Huy","year":"2024","unstructured":"Huy Le, Tung Kieu, Anh Nguyen, and Ngan Le. 2024. WAVER: Writing-Style Agnostic Text-Video Retrieval Via Distilling Vision-Language Models Through Open-Vocabulary Knowledge. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2024, Seoul, Republic of Korea, April 14-19, 2024."},{"key":"e_1_3_2_1_26_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS). 8498-8511","author":"Leclerc Guillaume","year":"2022","unstructured":"Guillaume Leclerc, Hadi Salman, Andrew Ilyas, Sai Vemprala, Logan Engstrom, Vibhav Vineet, Kai Xiao, Pengchuan Zhang, Shibani Santurkar, Greg Yang, et al., 2022. 3db: A framework for debugging computer vision models. In Annual Conference on Neural Information Processing Systems (NeurIPS). 8498-8511."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_28_1","volume-title":"Prototype-based Aleatoric Uncertainty Quantification for Cross-modal Retrieval. In Annual Conference on Neural Information Processing Systems (NeurIPS). 24564-2458","author":"Li Hao","year":"2023","unstructured":"Hao Li, Jingkuan Song, Lianli Gao, Xiaosu Zhu, and Hengtao Shen. 2023. Prototype-based Aleatoric Uncertainty Quantification for Cross-modal Retrieval. In Annual Conference on Neural Information Processing Systems (NeurIPS). 24564-2458."},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Machine Learning (ICML). 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning (ICML). 12888-12900."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01593"},{"key":"e_1_3_2_1_31_1","volume-title":"Towards Debiasing Sentence Representations. In Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL).","author":"Liang Paul Pu","year":"2020","unstructured":"Paul Pu Liang, Irene Mengze Li, Emily Zheng, Yao Chong Lim, Ruslan Salakhutdinov, and Louis-Philippe Morency. 2020. Towards Debiasing Sentence Representations. In Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL)."},{"key":"e_1_3_2_1_32_1","first-page":"38655","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS)","author":"Lin Chengzhi","year":"2022","unstructured":"Chengzhi Lin, Ancong Wu, Junwei Liang, Jun Zhang, Wenhang Ge, Wei-Shi Zheng, and Chunhua Shen. 2022. Text-adaptive multiple visual prototype matching for video-text retrieval. Annual Conference on Neural Information Processing Systems (NeurIPS) (2022), 38655-38666."},{"key":"e_1_3_2_1_33_1","volume-title":"Reducing the Vision and Language Bias for Temporal Sentence Grounding. In ACM International Conference on Multimedia (MM). 4092-4101","author":"Liu Daizong","year":"2022","unstructured":"Daizong Liu, Xiaoye Qu, and Wei Hu. 2022b. Reducing the Vision and Language Bias for Temporal Sentence Grounding. In ACM International Conference on Multimedia (MM). 4092-4101."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3217449"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"e_1_3_2_1_36_1","unstructured":"Zhuang Liu and Kaiming He. 2025. A Decade's Battle on Dataset Bias: Are We There Yet?. In ICLR."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_38_1","volume-title":"Support-Set Bottlenecks for Video-Text Representation Learning. In International Conference on Learning Representations (ICLR).","author":"Patrick Mandela","year":"2021","unstructured":"Mandela Patrick, Po-Yao Huang, Yuki Markus Asano, Florian Metze, Alexander G. Hauptmann, Jo a, o F. Henriques, and Andrea Vedaldi. 2021. Support-Set Bottlenecks for Video-Text Representation Learning. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_39_1","volume-title":"Marco T\u00falio Ribeiro, and Ameet Talwalkar","author":"Plumb Gregory","year":"2022","unstructured":"Gregory Plumb, Marco T\u00falio Ribeiro, and Ameet Talwalkar. 2022. Finding and Fixing Spurious Patterns with Explanations. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning (ICML). 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML). 8748-8763."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"e_1_3_2_1_42_1","volume-title":"Yongjun Bao, and Guiguang Ding.","author":"Shen Leqi","year":"2025","unstructured":"Leqi Shen, Tianxiang Hao, Tao He, Sicheng Zhao, Yifeng Zhang, pengzhang liu, Yongjun Bao, and Guiguang Ding. 2025. TempMe: Video Temporal Token Merging for Efficient Text-Video Retrieval. In ICLR."},{"key":"e_1_3_2_1_43_1","volume-title":"In-Style: Bridging Text and Uncurated Videos with Style Transfer for Text-Video Retrieval. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 21981-21992","author":"Shvetsova Nina","year":"2023","unstructured":"Nina Shvetsova, Anna Kukleva, Bernt Schiele, and Hilde Kuehne. 2023. In-Style: Bridging Text and Uncurated Videos with Style Transfer for Text-Video Retrieval. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 21981-21992."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Nina Shvetsova Arsha Nagrani Bernt Schiele Hilde Kuehne and Christian Rupprecht. 2025. Unbiasing through Textual Descriptions: Mitigating Representation Bias in Video Benchmarks. (2025).","DOI":"10.1109\/CVPR52734.2025.02705"},{"key":"e_1_3_2_1_45_1","volume-title":"Mitigating Gender Bias in Natural Language Processing: Literature Review. In Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL). 1630-1640","author":"Sun Tony","year":"2019","unstructured":"Tony Sun, Andrew Gaut, Shirlyn Tang, Yuxin Huang, Mai ElSherief, Jieyu Zhao, Diba Mirza, Elizabeth M. Belding, Kai-Wei Chang, and William Yang Wang. 2019. Mitigating Gender Bias in Natural Language Processing: Literature Review. In Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL). 1630-1640."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548295"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01622"},{"key":"e_1_3_2_1_48_1","volume-title":"Representation Learning with Contrastive Predictive Coding. arXiv preprint arXiv:1807.03748","author":"van den Oord A\u00e4ron","year":"2018","unstructured":"A\u00e4ron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_49_1","first-page":"2579","article-title":"Visualizing Data using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing Data using t-SNE. Journal of Machine Learning Research, Vol. 9 (2008), 2579-2605.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_50_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS). 5998-6008","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Annual Conference on Neural Information Processing Systems (NeurIPS). 5998-6008."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01702-9"},{"key":"e_1_3_2_1_52_1","volume-title":"Diffusion-Inspired Truncated Sampler for Text-Video Retrieval. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024 NeurIPS","author":"Wang Jiamian","year":"2024","unstructured":"Jiamian Wang, Pichao Wang, Dongfang Liu, Qiang Guan, Sohail A. Dianat, Majid Rabbani, Raghuveer Rao, and Zhiqiang Tao. 2024. Diffusion-Inspired Truncated Sampler for Text-Video Retrieval. In Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024 NeurIPS 2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Towards Fairness in Visual Recognition: Effective Strategies for Bias Mitigation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 8916-8925","author":"Wang Zeyu","year":"2020","unstructured":"Zeyu Wang, Klint Qinami, Ioannis Christos Karakozis, Kyle Genova, Prem Nair, Kenji Hata, and Olga Russakovsky. 2020. Towards Fairness in Visual Recognition: Effective Strategies for Bias Mitigation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 8916-8925."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32935"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25412"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25414"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28475"},{"key":"e_1_3_2_1_59_1","volume-title":"CoCa: Contrastive Captioners are Image-Text Foundation Models. Transactions on Machine Learning Research","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. CoCa: Contrastive Captioners are Image-Text Foundation Models. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_2_1_60_1","volume-title":"Gender Bias in Contextualized Word Embeddings. In Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). 629-634","author":"Zhao Jieyu","year":"2019","unstructured":"Jieyu Zhao, Tianlu Wang, Mark Yatskar, Ryan Cotterell, Vicente Ordonez, and Kai-Wei Chang. 2019. Gender Bias in Contextualized Word Embeddings. In Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT). 629-634."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754833","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:42:05Z","timestamp":1765309325000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754833"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":61,"alternative-id":["10.1145\/3746027.3754833","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754833","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}