{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T16:28:49Z","timestamp":1774369729435,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611826","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"4480-4491","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["COPA : Efficient Vision-Language Pre-training through Collaborative Object- and Patch-Text Alignment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7282-159X","authenticated-orcid":false,"given":"Chaoya","family":"Jiang","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9442-5912","authenticated-orcid":false,"given":"Haiyang","family":"Xu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9331-4716","authenticated-orcid":false,"given":"Wei","family":"Ye","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7977-5540","authenticated-orcid":false,"given":"Qinghao","family":"Ye","sequence":"additional","affiliation":[{"name":"Alibaba 
Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9077-3928","authenticated-orcid":false,"given":"Chenliang","family":"Li","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4959-8878","authenticated-orcid":false,"given":"Ming","family":"Yan","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2207-9146","authenticated-orcid":false,"given":"Bin","family":"Bi","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8576-2674","authenticated-orcid":false,"given":"Shikun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3709-5053","authenticated-orcid":false,"given":"Fei","family":"Huang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3835-7975","authenticated-orcid":false,"given":"Ji","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0966-6"},{"key":"e_1_3_2_1_2_1","volume-title":"nocaps: novel object captioning at scale. CoRR","author":"Agrawal Harsh","year":"2018","unstructured":"Harsh Agrawal, Karan Desai, Yufei Wang, Xinlei Chen, Rishabh Jain, Mark Johnson, Dhruv Batra, Devi Parikh, Stefan Lee, and Peter Anderson. 
2018. nocaps: novel object captioning at scale. CoRR, Vol. abs\/1812.08658 (2018). [arXiv]1812.08658 http:\/\/arxiv.org\/abs\/1812.08658"},{"key":"e_1_3_2_1_3_1","volume-title":"Palm: Pre-training an autoencoding&autoregressive language model for context-conditioned generation. arXiv preprint arXiv:2004.07159","author":"Bi Bin","year":"2020","unstructured":"Bin Bi, Chenliang Li, Chen Wu, Ming Yan, Wei Wang, Songfang Huang, Fei Huang, and Luo Si. 2020. Palm: Pre-training an autoencoding&autoregressive language model for context-conditioned generation. arXiv preprint arXiv:2004.07159 (2020)."},{"key":"e_1_3_2_1_4_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020. UNITER: UNiversal Image-TExt Representation Learning. In ECCV."},{"key":"e_1_3_2_1_5_1","volume-title":"Rethinking Attention with Performers. ArXiv","author":"Choromanski Krzysztof","year":"2021","unstructured":"Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tam\u00e1s Sarl\u00f3s, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, David Belanger, Lucy J. Colwell, and Adrian Weller. 2021. Rethinking Attention with Performers. ArXiv, Vol. abs\/2009.14794 (2021)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"e_1_3_2_1_7_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv, Vol. abs\/1810.04805 (2019)."},{"key":"e_1_3_2_1_8_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. 
ArXiv","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv, Vol. abs\/2010.11929 (2021)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Zi-Yi Dou Yichong Xu Zhe Gan Jianfeng Wang Shuohang Wang Lijuan Wang Chenguang Zhu Zicheng Liu Michael Zeng et al. 2021. An Empirical Study of Training End-to-End Vision-and-Language Transformers. arXiv preprint arXiv:2111.02387 (2021).","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"e_1_3_2_1_10_1","unstructured":"Zhe Gan Yen-Chun Chen Linjie Li Chen Zhu Yu Cheng and Jingjing Liu. 2020. Large-Scale Adversarial Training for Vision-and-Language Representation Learning. In NeurIPS."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_12_1","volume-title":"Mask R-CNN. 2017 IEEE International Conference on Computer Vision (ICCV)","author":"He Kaiming","year":"2017","unstructured":"Kaiming He, Georgia Gkioxari, Piotr Doll\u00e1r, and Ross B. Girshick. 2017. Mask R-CNN. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 2980--2988."},{"key":"e_1_3_2_1_13_1","volume-title":"Rethinking Spatial Dimensions of Vision Transformers. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021","author":"Heo Byeongho","year":"2021","unstructured":"Byeongho Heo, Sangdoo Yun, Dongyoon Han, Sanghyuk Chun, Junsuk Choe, and Seong Joon Oh. 2021. Rethinking Spatial Dimensions of Vision Transformers. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021), 11916--11925."},{"key":"e_1_3_2_1_14_1","volume-title":"Pixel-BERT: Aligning Image Pixels with Text by Deep Multi-Modal Transformers. 
ArXiv","author":"Huang Zhicheng","year":"2020","unstructured":"Zhicheng Huang, Zhaoyang Zeng, Bei Liu, Dongmei Fu, and Jianlong Fu. 2020. Pixel-BERT: Aligning Image Pixels with Text by Deep Multi-Modal Transformers. ArXiv, Vol. abs\/2004.00849 (2020)."},{"key":"e_1_3_2_1_15_1","volume-title":"Scaling up visual and vision-language representation learning with noisy text supervision. arXiv preprint arXiv:2102.05918","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V Le, Yunhsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. arXiv preprint arXiv:2102.05918 (2021)."},{"key":"e_1_3_2_1_16_1","volume-title":"TRIPS: Efficient Vision-and-Language Pre-training with Text-Relevant Image Patch Selection. In Conference on Empirical Methods in Natural Language Processing.","author":"Jiang Chaoya","year":"2022","unstructured":"Chaoya Jiang, Haiyang Xu, Chenliang Li, Ming Yan, Wei Ye, Shikun Zhang, Bin Bi, and Songfang Huang. 2022. TRIPS: Efficient Vision-and-Language Pre-training with Text-Relevant Image Patch Selection. In Conference on Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_19_1","unstructured":"Wonjae Kim Bokyung Son and Ildoo Kim. 2021. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In ICML."},{"key":"e_1_3_2_1_20_1","volume-title":"Reformer: The Efficient Transformer. ArXiv","author":"Kitaev Nikita","year":"2020","unstructured":"Nikita Kitaev, Lukasz Kaiser, and Anselm Levskaya. 2020. Reformer: The Efficient Transformer. ArXiv, Vol. 
abs\/2001.04451 (2020)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Chenliang Li Haiyang Xu Junfeng Tian Wei Wang Ming Yan Bin Bi Jiabo Ye Hehong Chen Guohai Xu Zheng Cao Ji Zhang Songfang Huang Fei Huang Jingren Zhou and Luo Si. 2022b. mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections.","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"e_1_3_2_1_23_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. arXiv preprint arXiv:2201.12086","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. arXiv preprint arXiv:2201.12086 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Shafiq R. Joty, Caiming Xiong, and Steven C. H. Hoi.","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath R. Selvaraju, Akhilesh Deepak Gotmare, Shafiq R. Joty, Caiming Xiong, and Steven C. H. Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. In NeurIPS."},{"key":"e_1_3_2_1_25_1","volume-title":"VisualBERT: A Simple and Performant Baseline for Vision and Language. ArXiv","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. ArXiv, Vol. abs\/1908.03557 (2019)."},{"key":"e_1_3_2_1_26_1","volume-title":"Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In ECCV.","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Xiaowei Hu, Pengchuan Zhang, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, Yejin Choi, and Jianfeng Gao. 2020. 
Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In ECCV."},{"key":"e_1_3_2_1_27_1","volume-title":"Not All Patches are What You Need: Expediting Vision Transformers via Token Reorganizations. ArXiv","author":"Liang Youwei","year":"2022","unstructured":"Youwei Liang, Chongjian Ge, Zhan Tong, Yibing Song, Jue Wang, and Pengtao Xie. 2022. Not All Patches are What You Need: Expediting Vision Transformers via Token Reorganizations. ArXiv, Vol. abs\/2202.07800 (2022)."},{"key":"e_1_3_2_1_28_1","unstructured":"Tsung-Yi Lin Michael Maire Serge J. Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV."},{"key":"e_1_3_2_1_29_1","volume-title":"Jianwei Yang, Hang Su, Jun-Juan Zhu, and Lei Zhang.","author":"Liu Siyi","year":"2023","unstructured":"Siyi Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chun yue Li, Jianwei Yang, Hang Su, Jun-Juan Zhu, and Lei Zhang. 2023. Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection. ArXiv, Vol. abs\/2303.05499 (2023)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_31_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In ICLR."},{"key":"e_1_3_2_1_32_1","volume-title":"ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. NeurIPS","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. NeurIPS (2019)."},{"key":"e_1_3_2_1_33_1","volume-title":"Berg","author":"Ordonez Vicente","year":"2011","unstructured":"Vicente Ordonez, Girish Kulkarni, and Tamara L. Berg. 2011. Im2Text: Describing Images Using 1 Million Captioned Photographs. 
In NIPS."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0965-7"},{"key":"e_1_3_2_1_35_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_36_1","unstructured":"Yongming Rao Wenliang Zhao Benlin Liu Jiwen Lu Jie Zhou and Cho-Jui Hsieh. 2021. DynamicViT: Efficient Vision Transformers with Dynamic Token Sparsification. In NeurIPS."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_1_40_1","unstructured":"Michael S. Ryoo A. J. Piergiovanni Anurag Arnab Mostafa Dehghani and Anelia Angelova. 2021. TokenLearner: Adaptive Space-Time Tokenization for Videos. In NeurIPS."},{"key":"e_1_3_2_1_41_1","volume-title":"Image Alt-text Dataset For Automatic Image Captioning. In Annual Meeting of the Association for Computational Linguistics.","author":"Sharma Piyush","year":"2018","unstructured":"Piyush Sharma, Nan Ding, Sebastian Goodman, and Radu Soricut. 2018. Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In Annual Meeting of the Association for Computational Linguistics."},{"key":"e_1_3_2_1_42_1","volume-title":"FLAVA: A Foundational Language And Vision Alignment Model. 
ArXiv","author":"Singh Amanpreet","year":"2021","unstructured":"Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 2021. FLAVA: A Foundational Language And Vision Alignment Model. ArXiv, Vol. abs\/2112.04482 (2021)."},{"key":"e_1_3_2_1_43_1","volume-title":"VL-BERT: Pre-training of Generic Visual-Linguistic Representations. ArXiv","author":"Su Weijie","year":"2020","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2020. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. ArXiv, Vol. abs\/1908.08530 (2020)."},{"key":"e_1_3_2_1_44_1","volume-title":"LXMERT: Learning Cross-Modality Encoder Representations from Transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao Hao","year":"2019","unstructured":"Hao Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_45_1","volume-title":"Attention is All you Need. ArXiv","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam M. Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. ArXiv, Vol. abs\/1706.03762 (2017)."},{"key":"e_1_3_2_1_46_1","volume-title":"MiniVLM: A Smaller and Faster Vision-Language Model. ArXiv","author":"Wang Jianfeng","year":"2020","unstructured":"Jianfeng Wang, Xiaowei Hu, Pengchuan Zhang, Xiujun Li, Lijuan Wang, L. Zhang, Jianfeng Gao, and Zicheng Liu. 2020a. MiniVLM: A Smaller and Faster Vision-Language Model. ArXiv, Vol. abs\/2012.06946 (2020)."},{"key":"e_1_3_2_1_47_1","volume-title":"Linformer: Self-Attention with Linear Complexity. ArXiv","author":"Wang Sinong","year":"2020","unstructured":"Sinong Wang, Belinda Z. Li, Madian Khabsa, Han Fang, and Hao Ma. 2020b. Linformer: Self-Attention with Linear Complexity. ArXiv, Vol. 
abs\/2006.04768 (2020)."},{"key":"e_1_3_2_1_48_1","volume-title":"VLMo: Unified Vision-Language Pre-Training with Mixture-of-Modality-Experts. ArXiv","author":"Wang Wenhui","year":"2021","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, and Furu Wei. 2021a. VLMo: Unified Vision-Language Pre-Training with Mixture-of-Modality-Experts. ArXiv, Vol. abs\/2111.02358 (2021)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"e_1_3_2_1_50_1","volume-title":"Zihang Dai, Yulia Tsvetkov, and Yuan Cao.","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021c. SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. ArXiv, Vol. abs\/2108.10904 (2021)."},{"key":"e_1_3_2_1_51_1","volume-title":"E2E-VLP: End-to-End Vision-Language Pre-training Enhanced by Visual Learning. ArXiv","author":"Xu Haiyang","year":"2021","unstructured":"Haiyang Xu, Ming Yan, Chenliang Li, Bin Bi, Songfang Huang, Wenming Xiao, and Fei Huang. 2021. E2E-VLP: End-to-End Vision-Language Pre-training Enhanced by Visual Learning. ArXiv, Vol. abs\/2106.01804 (2021)."},{"key":"e_1_3_2_1_52_1","volume-title":"Crossing the Format Boundary of Text and Boxes: Towards Unified Vision-Language Modeling. CoRR","author":"Yang Zhengyuan","year":"2021","unstructured":"Zhengyuan Yang, Zhe Gan, Jianfeng Wang, Xiaowei Hu, Faisal Ahmed, Zicheng Liu, Yumao Lu, and Lijuan Wang. 2021. Crossing the Format Boundary of Text and Boxes: Towards Unified Vision-Language Modeling. CoRR, Vol. abs\/2111.12085 (2021). [arXiv]2111.12085 https:\/\/arxiv.org\/abs\/2111.12085"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Fei Yu Jiji Tang Weichong Yin Yu Sun Hao Tian Hua Wu and Haifeng Wang. 2021. ERNIE-ViL: Knowledge Enhanced Vision-Language Representations Through Scene Graph. 
In AAAI.","DOI":"10.1609\/aaai.v35i4.16431"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_2_1_55_1","volume-title":"Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. ArXiv","author":"Zeng Yan","year":"2021","unstructured":"Yan Zeng, Xinsong Zhang, and Hang Li. 2021. Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts. ArXiv, Vol. abs\/2111.08276 (2021)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"P. Zhang X. Li X. Hu J. Yang L. Zhang L. Wang Y. Choi and J. Gao. 2021. VinVL: Making Visual Representations Matter in Vision-Language Models. (2021).","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_57_1","volume-title":"Unified Vision-Language Pre-Training for Image Captioning and VQA. ArXiv","author":"Zhou Luowei","year":"2020","unstructured":"Luowei Zhou, Hamid Palangi, Lei Zhang, Houdong Hu, Jason J. Corso, and Jianfeng Gao. 2020. Unified Vision-Language Pre-Training for Image Captioning and VQA. ArXiv, Vol. 
abs\/1909.11059 (2020)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611826","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611826","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:56:59Z","timestamp":1755820619000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611826"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":57,"alternative-id":["10.1145\/3581783.3611826","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611826","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}