{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T13:51:03Z","timestamp":1765547463468,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62301046"],"award-info":[{"award-number":["62301046"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755032","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:47:42Z","timestamp":1761371262000},"page":"3251-3260","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Decoupled Global-Local Alignment for Improving Compositional Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-2186-1276","authenticated-orcid":false,"given":"Xiaoxing","family":"Hu","sequence":"first","affiliation":[{"name":"School of Information and Electronics, Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6073-9014","authenticated-orcid":false,"given":"Kaicheng","family":"Yang","sequence":"additional","affiliation":[{"name":"DeepGlint, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9980-1112","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"additional","affiliation":[{"name":"DeepGlint, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2091-6158","authenticated-orcid":false,"given":"Haoran","family":"Xu","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8689-8366","authenticated-orcid":false,"given":"Ziyong","family":"Feng","sequence":"additional","affiliation":[{"name":"DeepGlint, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9771-6229","authenticated-orcid":false,"given":"Yupei","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Information and Electronic, Beijing Institute of Technology, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"23716","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, et al. 2022. Flamingo: a visual language model for few-shot learning. NIPS 35 (2022), 23716--23736.","journal-title":"NIPS"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_3_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_1_4_1","volume-title":"Paul Hongsuck Seo, and Seungryong Kim","author":"Cho Seokju","year":"2024","unstructured":"Seokju Cho, Heeseong Shin, Sunghwan Hong, Anurag Arnab, Paul Hongsuck Seo, and Seungryong Kim. 2024. Cat-seg: Cost aggregation for open-vocabulary semantic segmentation. In CVPR. 4113--4123."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Mircea Cimpoi Subhransu Maji Iasonas Kokkinos Sammy Mohamed and Andrea Vedaldi. 2014. Describing textures in the wild. In CVPR. 3606--3613.","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_6_1","volume-title":"Imagenet: A large-scale hierarchical image database. In CVPR. Ieee, 248--255.","author":"Deng Jia","year":"2009","unstructured":"Jia Deng,Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database. In CVPR. Ieee, 248--255."},{"key":"e_1_3_2_1_7_1","first-page":"76137","article-title":"Dense and aligned captions (dac) promote compositional reasoning in vl models","volume":"36","author":"Doveh Sivan","year":"2023","unstructured":"Sivan Doveh, Assaf Arbelle, Sivan Harary, Roei Herzig, Donghyun Kim, Paola Cascante-Bonilla, Amit Alfassy, Rameswar Panda, Raja Giryes, Rogerio Feris, et al. 2023. Dense and aligned captions (dac) promote compositional reasoning in vl models. NIPS 36 (2023), 76137--76150.","journal-title":"NIPS"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Sivan Doveh Assaf Arbelle Sivan Harary Eli Schwartz Roei Herzig Raja Giryes Rogerio Feris Rameswar Panda Shimon Ullman and Leonid Karlinsky. 2023. Teaching structured vision & language concepts to vision & language models. In CVPR. 2657--2668.","DOI":"10.1109\/CVPR52729.2023.00261"},{"key":"e_1_3_2_1_9_1","first-page":"35544","article-title":"Improving clip training with language rewrites","volume":"36","author":"Fan Lijie","year":"2023","unstructured":"Lijie Fan, Dilip Krishnan, Phillip Isola, Dina Katabi, and Yonglong Tian. 2023. Improving clip training with language rewrites. NIPS 36 (2023), 35544--35575.","journal-title":"NIPS"},{"volume-title":"Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories","author":"Fei-Fei Li","key":"e_1_3_2_1_10_1","unstructured":"Li Fei-Fei, Rob Fergus, and Pietro Perona. 2004. Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories. In CVPR. IEEE, 178--178."},{"key":"e_1_3_2_1_11_1","volume-title":"HiCLIP: Contrastive language-image pretraining with hierarchy-aware attention. ICLR","author":"Geng Shijie","year":"2023","unstructured":"Shijie Geng, Jianbo Yuan, Yu Tian, Yuxiao Chen, and Yongfeng Zhang. 2023. HiCLIP: Contrastive language-image pretraining with hierarchy-aware attention. ICLR (2023)."},{"key":"e_1_3_2_1_12_1","first-page":"6704","article-title":"Cyclip: Cyclic contrastive language-image pretraining","volume":"35","author":"Goel Shashank","year":"2022","unstructured":"Shashank Goel, Hritik Bansal, Sumit Bhatia, Ryan Rossi, Vishwa Vinay, and Aditya Grover. 2022. Cyclip: Cyclic contrastive language-image pretraining. NIPS 35 (2022), 6704--6719.","journal-title":"NIPS"},{"key":"e_1_3_2_1_13_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"RWKV-CLIP: a robust vision-language representation learner. arXiv preprint arXiv:2406.06973","author":"Gu Tiancheng","year":"2024","unstructured":"Tiancheng Gu, Kaicheng Yang, Xiang An, Ziyong Feng, Dongnan Liu, Weidong Cai, and Jiankang Deng. 2024. RWKV-CLIP: a robust vision-language representation learner. arXiv preprint arXiv:2406.06973 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Breaking the Modality Barrier: Universal Embedding Learning with Multimodal LLMs. arXiv preprint arXiv:2504.17432","author":"Gu Tiancheng","year":"2025","unstructured":"Tiancheng Gu, Kaicheng Yang, Ziyong Feng, Xingjun Wang, Yanzhao Zhang, Dingkun Long, Yingda Chen, Weidong Cai, and Jiankang Deng. 2025. Breaking the Modality Barrier: Universal Embedding Learning with Multimodal LLMs. arXiv preprint arXiv:2504.17432 (2025)."},{"key":"e_1_3_2_1_16_1","volume-title":"RealSyn: An Effective and Scalable Multimodal Interleaved Document Transformation Paradigm. arXiv preprint arXiv:2502.12513","author":"Gu Tiancheng","year":"2025","unstructured":"Tiancheng Gu, Kaicheng Yang, Chaoyi Zhang, Yin Xie, Xiang An, Ziyong Feng, Dongnan Liu, Weidong Cai, and Jiankang Deng. 2025. RealSyn: An Effective and Scalable Multimodal Interleaved Document Transformation Paradigm. arXiv preprint arXiv:2502.12513 (2025)."},{"key":"e_1_3_2_1_17_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_18_1","volume-title":"Distilling the Knowledge in a Neural Network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton. 2015. Distilling the Knowledge in a Neural Network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_19_1","volume-title":"Muhammad Ferjad Naeem, Luc Van Gool, and Federico Tombari.","author":"Hoyer Lukas","year":"2025","unstructured":"Lukas Hoyer, David Joseph Tan, Muhammad Ferjad Naeem, Luc Van Gool, and Federico Tombari. 2025. Semivl: Semi-supervised semantic segmentation with vision-language guidance. In ECCV. Springer, 257--275."},{"key":"e_1_3_2_1_20_1","volume-title":"Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality. NIPS 36","author":"Hsieh Cheng-Yu","year":"2024","unstructured":"Cheng-Yu Hsieh, Jieyu Zhang, Zixian Ma, Aniruddha Kembhavi, and Ranjay Krishna. 2024. Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality. NIPS 36 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i11.29153"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.28017"},{"key":"e_1_3_2_1_23_1","volume-title":"Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351","author":"Jiao Xiaoqi","year":"2019","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2019. Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351 (2019)."},{"key":"e_1_3_2_1_24_1","volume-title":"What's'' up'' with vision language models? Investigating their struggle with spatial reasoning. arXiv preprint arXiv:2310.19785","author":"Kamath Amita","year":"2023","unstructured":"Amita Kamath, Jack Hessel, and Kai-Wei Chang. 2023. What's'' up'' with vision language models? Investigating their struggle with spatial reasoning. arXiv preprint arXiv:2310.19785 (2023)."},{"volume-title":"The hard positive truth about vision-language compositionality","author":"Kamath Amita","key":"e_1_3_2_1_25_1","unstructured":"Amita Kamath, Cheng-Yu Hsieh, Kai-Wei Chang, and Ranjay Krishna. 2024. The hard positive truth about vision-language compositionality. In ECCV. Springer, 37--54."},{"key":"e_1_3_2_1_26_1","unstructured":"Prannay Kaul Weidi Xie and Andrew Zisserman. 2023. Multi-modal classifiers for open-vocabulary object detection. In ICML. PMLR 15946--15969."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Jonathan Krause Michael Stark Jia Deng and Li Fei-Fei. 2013. 3d object representations for fine-grained categorization. In ICCV. 554--561.","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_1_28_1","unstructured":"Alex Krizhevsky Geoffrey Hinton et al. 2009. Learning multiple layers of features from tiny images. (2009)."},{"key":"e_1_3_2_1_29_1","first-page":"1008","article-title":"Uniclip: Unified framework for contrastive language-image pre-training","volume":"35","author":"Lee Janghyeon","year":"2022","unstructured":"Janghyeon Lee, Jongsuk Kim, Hyounguk Shon, Bumsoo Kim, Seung Hwan Kim, Honglak Lee, and Junmo Kim. 2022. Uniclip: Unified framework for contrastive language-image pre-training. NIPS 35 (2022), 1008--1019.","journal-title":"NIPS"},{"key":"e_1_3_2_1_30_1","volume-title":"Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546","author":"Li Boyi","year":"2022","unstructured":"Boyi Li, Kilian Q Weinberger, Serge Belongie, Vladlen Koltun, and Ren\u00e9 Ranftl. 2022. Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"ICML. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML. PMLR, 19730--19742."},{"key":"e_1_3_2_1_32_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888--12900.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888--12900."},{"key":"e_1_3_2_1_33_1","unstructured":"Yanghao Li Haoqi Fan Ronghang Hu Christoph Feichtenhofer and Kaiming He. 2023. Scaling language-image pre-training via masking. In CVPR. 23390--23400."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25236"},{"key":"e_1_3_2_1_35_1","unstructured":"Zheng Li Jingwen Ye Mingli Song Ying Huang and Zhigeng Pan. 2021. Online knowledge distillation for efficient pose estimation. In ICCV. 11740--11750."},{"key":"e_1_3_2_1_36_1","volume-title":"Vera: A general-purpose plausibility estimation model for commonsense statements. arXiv preprint arXiv:2305.03695","author":"Liu Jiacheng","year":"2023","unstructured":"Jiacheng Liu, Wenya Wang, Dianzhuo Wang, Noah A Smith, Yejin Choi, and Hannaneh Hajishirzi. 2023. Vera: A general-purpose plausibility estimation model for commonsense statements. arXiv preprint arXiv:2305.03695 (2023)."},{"key":"e_1_3_2_1_37_1","first-page":"71078","article-title":"Codet: Cooccurrence guided region-word alignment for open-vocabulary object detection","volume":"36","author":"Ma Chuofan","year":"2023","unstructured":"Chuofan Ma, Yi Jiang, Xin Wen, Zehuan Yuan, and Xiaojuan Qi. 2023. Codet: Cooccurrence guided region-word alignment for open-vocabulary object detection. NIPS 36 (2023), 71078--71094.","journal-title":"NIPS"},{"key":"e_1_3_2_1_38_1","volume-title":"Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, MatthewBlaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)."},{"key":"e_1_3_2_1_39_1","volume-title":"Mapl: Parameter-efficient adaptation of unimodal pre-trained models for vision-language few-shot prompting. arXiv preprint arXiv:2210.07179","author":"Ma\u00f1as Oscar","year":"2022","unstructured":"Oscar Ma\u00f1as, Pau Rodriguez, Saba Ahmadi, Aida Nematzadeh, Yash Goyal, and Aishwarya Agrawal. 2022. Mapl: Parameter-efficient adaptation of unimodal pre-trained models for vision-language few-shot prompting. arXiv preprint arXiv:2210.07179 (2022)."},{"key":"e_1_3_2_1_40_1","volume-title":"Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, et al.","author":"Minderer Matthias","year":"2022","unstructured":"Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, et al. 2022. Simple open-vocabulary object detection. In ECCV. Springer, 728--755."},{"key":"e_1_3_2_1_41_1","volume-title":"Jake Grigsby, Di Jin, and Yanjun Qi.","author":"Morris John X","year":"2020","unstructured":"John X Morris, Eli Lifland, Jin Yong Yoo, Jake Grigsby, Di Jin, and Yanjun Qi. 2020. Textattack: A framework for adversarial attacks, data augmentation, and adversarial training in nlp. arXiv preprint arXiv:2005.05909 (2020)."},{"key":"e_1_3_2_1_42_1","volume-title":"Slip: Selfsupervision meets language-image pre-training","author":"Mu Norman","year":"2022","unstructured":"Norman Mu, Alexander Kirillov, David Wagner, and Saining Xie. 2022. Slip: Selfsupervision meets language-image pre-training. In ECCV. Springer, 529--544."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_44_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_45_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_46_1","volume-title":"VALSE: A task-independent benchmark for vision and language models centered on linguistic phenomena. arXiv preprint arXiv:2112.07566","author":"Parcalabescu Letitia","year":"2021","unstructured":"Letitia Parcalabescu, Michele Cafagna, Lilitta Muradjan, Anette Frank, Iacer Calixto, and Albert Gatt. 2021. VALSE: A task-independent benchmark for vision and language models centered on linguistic phenomena. arXiv preprint arXiv:2112.07566 (2021)."},{"volume-title":"Cats and dogs","author":"Parkhi Omkar M","key":"e_1_3_2_1_47_1","unstructured":"Omkar M Parkhi, Andrea Vedaldi, Andrew Zisserman, and CV Jawahar. 2012. Cats and dogs. In CVPR. IEEE, 3498--3505."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Wujian Peng Sicheng Xie Zuyao You Shiyi Lan and Zuxuan Wu. 2024. Synthesize Diagnose and Optimize: Towards Fine-Grained Vision-Language Understanding. In CVPR. 13279--13288.","DOI":"10.1109\/CVPR52733.2024.01261"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Bryan A Plummer LiweiWang ChrisMCervantes Juan C Caicedo Julia Hockenmaier and Svetlana Lazebnik. 2015. Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In ICCV. 2641--2649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_50_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. PmLR, 8748--8763."},{"key":"e_1_3_2_1_51_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748--8763."},{"key":"e_1_3_2_1_52_1","volume-title":"Winoground: Probing vision and language models for visio-linguistic compositionality. In CVPR. 5238--5248.","author":"Thrush Tristan","year":"2022","unstructured":"Tristan Thrush, Ryan Jiang, Max Bartolo, Amanpreet Singh, Adina Williams, Douwe Kiela, and Candace Ross. 2022. Winoground: Probing vision and language models for visio-linguistic compositionality. In CVPR. 5238--5248."},{"key":"e_1_3_2_1_53_1","volume-title":"Saksham Singhal, Subhojit Som, et al.","author":"Wang Wenhui","year":"2022","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, et al. 2022. Image as a foreign language: Beit pretraining for all vision and visionlanguage tasks. arXiv preprint arXiv:2208.10442 (2022)."},{"key":"e_1_3_2_1_54_1","volume-title":"Xinggang Wang, et al.","author":"Wu Kan","year":"2023","unstructured":"Kan Wu, Houwen Peng, Zhenghong Zhou, Bin Xiao, Mengchen Liu, Lu Yuan, Hong Xuan, Michael Valenzuela, Xi Stephen Chen, Xinggang Wang, et al. 2023. Tinyclip: Clip distillation via affinity mimicking and weight inheritance. In ICCV. 21970--21980."},{"key":"e_1_3_2_1_55_1","volume-title":"CLIPSelf: Vision Transformer Distills Itself for Open-Vocabulary Dense Prediction. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=DjzvJCRsVf","author":"Wu Size","year":"2024","unstructured":"Size Wu, Wenwei Zhang, Lumin Xu, Sheng Jin, Xiangtai Li, Wentao Liu, and Chen Change Loy. 2024. CLIPSelf: Vision Transformer Distills Itself for Open-Vocabulary Dense Prediction. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=DjzvJCRsVf"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"YaoWu Mingwei Xing Yachao Zhang Yuan Xie and Yanyun Qu. 2024. Clip2uda: Making frozen clip reward unsupervised domain adaptation in 3d semantic segmentation. In ACM MM. 8662--8671.","DOI":"10.1145\/3664647.3680582"},{"volume-title":"Sun database: Large-scale scene recognition from abbey to zoo","author":"Xiao Jianxiong","key":"e_1_3_2_1_57_1","unstructured":"Jianxiong Xiao, James Hays, Krista A Ehinger, Aude Oliva, and Antonio Torralba. 2010. Sun database: Large-scale scene recognition from abbey to zoo. In CVPR. IEEE, 3485--3492."},{"key":"e_1_3_2_1_58_1","volume-title":"Alip: Adaptive language-image pre-training with synthetic caption. In ICCV. 2922--2931.","author":"Yang Kaicheng","year":"2023","unstructured":"Kaicheng Yang, Jiankang Deng, Xiang An, Jiawei Li, Ziyong Feng, Jia Guo, Jing Yang, and Tongliang Liu. 2023. Alip: Adaptive language-image pre-training with synthetic caption. In ICCV. 2922--2931."},{"key":"e_1_3_2_1_59_1","volume-title":"Clip-cid: Efficient clip distillation via cluster-instance discrimination. AAAI","author":"Yang Kaicheng","year":"2024","unstructured":"Kaicheng Yang, Tiancheng Gu, Xiang An, Haiqiang Jiang, Xiangzi Dai, Ziyong Feng, Weidong Cai, and Jiankang Deng. 2024. Clip-cid: Efficient clip distillation via cluster-instance discrimination. AAAI (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783","author":"Yao Lewei","year":"2021","unstructured":"Lewei Yao, Runhui Huang, Lu Hou, Guansong Lu, Minzhe Niu, Hang Xu, Xiaodan Liang, Zhenguo Li, Xin Jiang, and Chunjing Xu. 2021. Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)."},{"key":"e_1_3_2_1_61_1","first-page":"32215","article-title":"Convolutions die hard: Open-vocabulary segmentation with single frozen convolutional clip","volume":"36","author":"Yu Qihang","year":"2023","unstructured":"Qihang Yu, Ju He, Xueqing Deng, Xiaohui Shen, and Liang-Chieh Chen. 2023. Convolutions die hard: Open-vocabulary segmentation with single frozen convolutional clip. NIPS 36 (2023), 32215--32234.","journal-title":"NIPS"},{"key":"e_1_3_2_1_62_1","volume-title":"When and why vision-language models behave like bags-of-words, and what to do about it? arXiv preprint arXiv:2210.01936","author":"Yuksekgonul Mert","year":"2022","unstructured":"Mert Yuksekgonul, Federico Bianchi, Pratyusha Kalluri, Dan Jurafsky, and James Zou. 2022. When and why vision-language models behave like bags-of-words, and what to do about it? arXiv preprint arXiv:2210.01936 (2022)."},{"volume-title":"Open-vocabulary detr with conditional matching","author":"Zang Yuhang","key":"e_1_3_2_1_63_1","unstructured":"Yuhang Zang, Wei Li, Kaiyang Zhou, Chen Huang, and Chen Change Loy. 2022. Open-vocabulary detr with conditional matching. In ECCV. Springer, 106--122."},{"key":"e_1_3_2_1_64_1","volume-title":"Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. arXiv preprint arXiv:2111.08276","author":"Zeng Yan","year":"2021","unstructured":"Yan Zeng, Xinsong Zhang, and Hang Li. 2021. Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. arXiv preprint arXiv:2111.08276 (2021)."},{"key":"e_1_3_2_1_65_1","volume-title":"Multi-grained vision language pre-training: Aligning texts with visual concepts. arXiv preprint arXiv:2111.08276","author":"Zeng Yan","year":"2021","unstructured":"Yan Zeng, Xinsong Zhang, and Hang Li. 2021. Multi-grained vision language pre-training: Aligning texts with visual concepts. arXiv preprint arXiv:2111.08276 (2021)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","unstructured":"Xiaohua Zhai Basil Mustafa Alexander Kolesnikov and Lucas Beyer. 2023. Sigmoid loss for language image pre-training. In ICCV. 11975--11986.","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"crossref","unstructured":"Le Zhang Rabiul Awal and Aishwarya Agrawal. 2024. Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to Enhance Visio-Linguistic Compositional Understanding. In CVPR. 13774--13784.","DOI":"10.1109\/CVPR52733.2024.01307"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"crossref","unstructured":"Lu Zhang Ke Yan and Shouhong Ding. 2024. AlignCLIP: Align Multi Domains of Texts Input for CLIP models with Object-IoU Loss. In ACM MM. 1092--1100.","DOI":"10.1145\/3664647.3681636"},{"key":"e_1_3_2_1_69_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755032","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:16:31Z","timestamp":1765307791000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755032"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":69,"alternative-id":["10.1145\/3746027.3755032","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755032","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}