{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:30:20Z","timestamp":1775068220015,"version":"3.50.1"},"reference-count":80,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.01307","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"13774-13784","source":"Crossref","is-referenced-by-count":12,"title":["Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to Enhance Visio-Linguistic Compositional Understanding"],"prefix":"10.1109","author":[{"given":"Le","family":"Zhang","sequence":"first","affiliation":[{"name":"Mila - Quebec AI Institute"}]},{"given":"Rabiul","family":"Awal","sequence":"additional","affiliation":[{"name":"Mila - Quebec AI Institute"}]},{"given":"Aishwarya","family":"Agrawal","sequence":"additional","affiliation":[{"name":"Mila - Quebec AI Institute"}]}],"member":"263","reference":[{"key":"ref1","author":"Alayrac","year":"2022","journal-title":"Flamingo: a visual language model for few-shot learning"},{"key":"ref2","author":"Awadalla","year":"2023","journal-title":"Openflamingo: An open-source framework for training large autoregressive vision-language models"},{"key":"ref3","article-title":"Augmenting clip with im-proved visio-linguistic reasoning","author":"Basu","year":"2023","journal-title":"ArXiv"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.87"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01844"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.145"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.39"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1023\/A:1022627411411"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.143"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.00261"},{"key":"ref13","article-title":"Shimon Ull-man, and Leonid Karlinsky","author":"Doveh","year":"2023","journal-title":"Dense and aligned captions (dac) promote compositional reasoning in vl models"},{"key":"ref14","author":"Gadre","year":"2023","journal-title":"Datacomp: In search of the next generation of multimodal datasets"},{"key":"ref15","first-page":"1","article-title":"Clip-adapter: Better vision-language models with feature adapters","author":"Gao","year":"2023","journal-title":"International Journal of Computer Vision"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_17"},{"key":"ref17","author":"Goel","year":"2022","journal-title":"Cyclip: Cyclic contrastive language-image 
pretraining"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.307"},{"key":"ref21","author":"Herzig","year":"2023","journal-title":"Incorporating structured representations into pretrained vi-sion - language models using scene graphs"},{"key":"ref22","author":"Hessel","year":"2022","journal-title":"Clipscore: A reference-free evaluation metric for image captioning"},{"key":"ref23","article-title":"spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing","author":"Honnibal","year":"2017","journal-title":"To appear"},{"key":"ref24","author":"Hsieh","year":"2023","journal-title":"Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality"},{"key":"ref25","author":"Jia","year":"2021","journal-title":"Scaling up visual and vision-language representation learning with noisy text supervision"},{"key":"ref26","article-title":"Hard negative mixing for contrastive learning","author":"Kalantidis","year":"2020","journal-title":"ArXiv"},{"key":"ref27","first-page":"21798","article-title":"Hard negative mixing for con-trastive learning","volume":"33","author":"Kalantidis","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref28","article-title":"Segment any-thing","author":"Kirillov","year":"2023","journal-title":"arXiv preprint"},{"key":"ref29","author":"Krishna","year":"2016","journal-title":"Visual genome: Connecting language and vision using crowdsourced dense image annotations"},{"key":"ref30","author":"Krizhevsky","journal-title":"Cifar-10 (canadian institute for advanced research)"},{"key":"ref31","author":"Li","year":"2021","journal-title":"Align before fuse: Vision and language representation learning with momentum distillation"},{"key":"ref32","author":"Li","year":"2022","journal-title":"Blip: Bootstrapping language-image pre-training for unified vision- language understanding and generation"},{"key":"ref33","author":"Li","year":"2023","journal-title":"Blip- 2: Bootstrapping language-image pre-training with frozen image encoders and large language models"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref36","article-title":"Adap-tiveface: Adaptive margin and sampling for face recognition","volume-title":"Proceedings of the IEEEICVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Liu","year":"2019"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.33540\/2168"},{"key":"ref38","author":"Liu","year":"2023","journal-title":"Vera: A general- purpose plausibility estimation model for commonsense state-ments"},{"key":"ref39","author":"Liu","year":"2019","journal-title":"Roberta: A robustly optimized bert pretraining approach"},{"key":"ref40","article-title":"Crepe: Can vision-language foundation models reason compositionally?","author":"Ma","year":"2022","journal-title":"ar Xiv preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.309"},{"key":"ref42","author":"Manas","year":"2023","journal-title":"Mapl: Parameter-efficient adaptation of unimodal pre-trained mod-els for vision-language few-shot prompting"},{"key":"ref43","author":"Manas","year":"2023","journal-title":"Mapl: Parameter-efficient adaptation of unimodal 
pre-trained mod-els for vision-language few-shot prompting"},{"key":"ref44","article-title":"Autoclip: Auto-tuning zero-shot classifiers for vision-language models","author":"Metzen","year":"2023","journal-title":"ArXiv"},{"key":"ref45","author":"Minderer","year":"2022","journal-title":"Simple open-vocabulary object detection with vi-sion transformers"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.16"},{"key":"ref47","article-title":"Chils: Zero-shot image classi-fication with hierarchical label sets","author":"Novack","year":"2023","journal-title":"ArXiv"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00294"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.567"},{"key":"ref50","article-title":"Dreamfusion: Text-to-3d using 2d diffusion","author":"Poole","year":"2022","journal-title":"arXiv preprint"},{"key":"ref51","article-title":"Understanding and improving robustness of vision transformers through patch-based negative augmentation","author":"Qin","year":"2021","journal-title":"ArXiv"},{"key":"ref52","first-page":"8748","article-title":"Learning transferable visual models from natural language supervi-sion","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref53","author":"Ramesh","year":"2022","journal-title":"Hierarchical text-conditional image generation with clip latents"},{"key":"ref54","article-title":"Contrastive learning with hard negative sam-ples","author":"Robinson","year":"2020","journal-title":"ArXiv"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref56","author":"Schuhmann","year":"2021","journal-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs"},{"key":"ref57","author":"Schuhmann","year":"2022","journal-title":"Laion-5b: An open large-scale dataset for training next generation image-text models"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"ref60","author":"Singh","year":"2023","journal-title":"Coarse-to-fine contrastive learning in image-text-graph space for im-proved vision-language compositionality"},{"key":"ref61","author":"Sun","year":"2023","journal-title":"Eva-clip: Improved training techniques for clip at scale"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref63","first-page":"5238","article-title":"Wino ground: Probing vision and language models for visio-linguistic compositionality","volume-title":"Proceedings of the IEEEICVF Conference on Computer Vision and Pattern Recognition","author":"Thrush","year":"2022"},{"key":"ref64","author":"Wang","year":"2023","journal-title":"Sam-clip: Merging vision foundation models towards semantic and spatial understanding"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.180"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01838"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3069908"},{"key":"ref68","article-title":"A simple baseline for zero-shot semantic segmentation with pre-trained vision-language model","author":"Xu","year":"2021","journal-title":"ArXiv"},{"key":"ref69","author":"Yu","year":"2022","journal-title":"Coca: Contrastive captioners are image-text foundation models"},{"key":"ref70","article-title":"When and why vision-language models behave like bags-of-words, 
and what to do about it?","author":"Yuksekgonul","year":"2022","journal-title":"arXiv e-prints"},{"key":"ref71","author":"Zeng","year":"2022","journal-title":"Multi-grained vision language pre-training: Aligning texts with visual concepts"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref73","first-page":"3801","article-title":"Col-laborative and adversarial network for unsupervised domain adaptation","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Zhang","year":"2018"},{"key":"ref74","author":"Zhao","year":"2022","journal-title":"VI-checklist: Evaluating pre-trained vision-language models with objects, attributes and relations"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00393"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01075"},{"key":"ref79","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"ar Xiv preprint"},{"key":"ref80","author":"Zhu","year":"2023","journal-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Seattle, WA, USA","start":{"date-parts":[[2024,6,16]]},"end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10656018.pdf?arnumber=10656018","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T06:16:44Z","timestamp":1726813004000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10656018\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":80,"URL":"https:\/\/doi.org\/10.1109\/cvpr52733.2024.01307","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}