{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:45Z","timestamp":1750309545812,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Outstanding Member Program of the Youth Innovation Promotion Association of Chinese Academy of Sciences","award":["Y2021112, Y2023118"],"award-info":[{"award-number":["Y2021112, Y2023118"]}]},{"name":"Tianshan Talent Training Program","award":["2022TSYCCX0059, 2023TSYCCX0041, 2023TSYCCX0044"],"award-info":[{"award-number":["2022TSYCCX0059, 2023TSYCCX0041, 2023TSYCCX0044"]}]},{"name":"Tianshan Elite Science and Technology Innovation Leading Talents Program","award":["2022TSYCLJ0046"],"award-info":[{"award-number":["2022TSYCLJ0046"]}]},{"name":"Natural Science Foundation of Xinjiang Uyghur Autonomous Region","award":["2022D01D04, 2023D01D17, 2022D01D81, 2024D01D29"],"award-info":[{"award-number":["2022D01D04, 2023D01D17, 2022D01D81, 2024D01D29"]}]},{"name":"Key Research and Development Program of Xinjiang Uyghur Autonomous Region","award":["2023B03024, 2023B01005"],"award-info":[{"award-number":["2023B03024, 2023B01005"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714861","type":"proceedings-article","created":{"date-parts":[[2025,5,5]],"date-time":"2025-05-05T16:42:02Z","timestamp":1746463322000},"page":"3438-3450","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["M\n            <sup>2<\/sup>\n            -VLP: Enhancing Multilingual Vision-Language Pre-Training via Multi-Grained Alignment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6660-8989","authenticated-orcid":false,"given":"Ahtamjan","family":"Ahmat","sequence":"first","affiliation":[{"name":"Xinjiang Technical Institute of Physics &amp; Chemistry, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6685-0858","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"Xinjiang Technical Institute of Physics &amp; Chemistry, Chinese Academy of Sciences, Beijing, China, University of Chinese Academy of Sciences, Beijing, China, and Xinjiang Laboratory of Minority Speech and Language Information Processing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2639-3944","authenticated-orcid":false,"given":"Yating","family":"Yang","sequence":"additional","affiliation":[{"name":"Xinjiang Technical Institute of Physics &amp; Chemistry, Chinese Academy of Sciences, Beijing, China, University of Chinese Academy of Sciences, Beijing, China, and Xinjiang Laboratory of Minority Speech and Language Information Processing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3082-648X","authenticated-orcid":false,"given":"Bo","family":"Ma","sequence":"additional","affiliation":[{"name":"Xinjiang Technical Institute of Physics &amp; Chemistry, Chinese Academy of Sciences, Beijing, China, University of Chinese Academy of Sciences, Beijing, China, and Xinjiang Laboratory of Minority Speech and Language Information Processing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4110-3976","authenticated-orcid":false,"given":"Rui","family":"Dong","sequence":"additional","affiliation":[{"name":"Xinjiang Technical Institute of Physics &amp; Chemistry, Chinese Academy of Sciences, Beijing, China, University of Chinese Academy of Sciences, Beijing, China, and Xinjiang Laboratory of Minority Speech and Language Information Processing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6327-9171","authenticated-orcid":false,"given":"Kaiwen","family":"Lu","sequence":"additional","affiliation":[{"name":"Xinjiang Technical Institute of Physics &amp; Chemistry, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2974-0569","authenticated-orcid":false,"given":"Rong","family":"Ma","sequence":"additional","affiliation":[{"name":"Xinjiang Technical Institute of Physics &amp; Chemistry, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3735-6475","authenticated-orcid":false,"given":"Xinyue","family":"Wang","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Zeljko Agic and Natalie Schluter. 2018. Baselines and Test Data for Cross-Lingual Inference. In LREC."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Xi Ai and Bin Fang. 2023. Multilingual Pre-training with Self-supervision from Global Co-occurrence Information. In ACL Findings. 7526--7543.","DOI":"10.18653\/v1\/2023.findings-acl.475"},{"key":"e_1_3_2_1_3_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican et al. 2022. Flamingo: a visual language model for few-shot learning. NeurIPS (2022) 23716--23736."},{"key":"e_1_3_2_1_4_1","volume-title":"Manning","author":"Bowman Samuel R.","year":"2015","unstructured":"Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large annotated corpus for learning natural language inference. In EMNLP. 632--642."},{"key":"e_1_3_2_1_5_1","volume-title":"Edoardo Maria Ponti, and Ivan Vulic","author":"Bugliarello Emanuele","year":"2022","unstructured":"Emanuele Bugliarello, Fangyu Liu, Jonas Pfeiffer, Siva Reddy, Desmond Elliott, Edoardo Maria Ponti, and Ivan Vulic. 2022. IGLUE: A Benchmark for Transfer Learning across Modalities, Tasks, and Languages. In ICML. 2370--2392."},{"key":"e_1_3_2_1_6_1","unstructured":"Fredrik Carlsson Philipp Eisen Faton Rekathati and Magnus Sahlgren. 2022. Cross-lingual and Multilingual CLIP. In LREC. 6848--6854."},{"key":"e_1_3_2_1_7_1","volume-title":"VLP: A Survey on Vision-language Pre-training. Int. J. Autom. Comput.","author":"Chen Feilong","year":"2023","unstructured":"Feilong Chen, Duzhen Zhang, Minglun Han, Xiuyi Chen, Jing Shi, Shuang Xu, and Bo Xu. 2023. VLP: A Survey on Vision-language Pre-training. Int. J. Autom. Comput. (2023), 38--56."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Zewen Chi Li Dong Furu Wei Nan Yang Saksham Singhal Wenhui Wang Xia Song XianLing Mao Heyan Huang and Ming Zhou. 2021. InfoXLM: An Information-Theoretic Framework for Cross-Lingual Language Model Pre-Training. In NAACL. 3576--3588.","DOI":"10.18653\/v1\/2021.naacl-main.280"},{"key":"e_1_3_2_1_9_1","volume-title":"What does it mean to be language-agnostic? probing multilingual sentence encoders for typological properties. arXiv:2009.12862","author":"Choenni Rochelle","year":"2020","unstructured":"Rochelle Choenni and Ekaterina Shutova. 2020. What does it mean to be language-agnostic? probing multilingual sentence encoders for typological properties. arXiv:2009.12862 (2020)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Alexis Conneau Kartikay Khandelwal Naman Goyal Vishrav Chaudhary Guillaume Wenzek Francisco Guzm\u00e1n Edouard Grave Myle Ott Luke Zettlemoyer and Veselin Stoyanov. 2020. Unsupervised Cross-lingual Representation Learning at Scale. In ACL. 8440--8451.","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"e_1_3_2_1_11_1","unstructured":"Alexis Conneau and Guillaume Lample. 2019. Cross-lingual Language Model Pretraining. In NeurIPS. 7057--7067."},{"key":"e_1_3_2_1_12_1","volume-title":"Randaugment: Practical automated data augmentation with a reduced search space. In CVPR. 702--703.","author":"Cubuk Ekin D","year":"2020","unstructured":"Ekin D Cubuk, Barret Zoph, Jonathon Shlens, and Quoc V Le. 2020. Randaugment: Practical automated data augmentation with a reduced search space. In CVPR. 702--703."},{"key":"e_1_3_2_1_13_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL. 4171--4186.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, MingWei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL. 4171--4186."},{"key":"e_1_3_2_1_14_1","unstructured":"Chris Dyer Victor Chahuneau and Noah A Smith. 2013. A simple fast and effective reparameterization of IBM model 2. In NAACL-HLT. 644--648."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Desmond Elliott Stella Frank Khalil Sima'an and Lucia Specia. 2016. Multi30K: Multilingual English-German Image Descriptions. In ACL. 70--74.","DOI":"10.18653\/v1\/W16-3210"},{"key":"e_1_3_2_1_16_1","unstructured":"Ping Guo Xiangpeng Wei Yue Hu Baosong Yang Dayiheng Liu Fei Huang and Jun Xie. 2023. EMMA-X: An EM-like Multilingual Pre-training Algorithm for Cross-lingual Representation Learning. In NeurIPS."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Lianzhe Huang Shuming Ma Dongdong Zhang Furu Wei and Houfeng Wang. 2022. Zero-shot Cross-lingual Transfer of Prompt-based Tuning with a Unified Multilingual Prompt. In EMNLP. 11488--11497.","DOI":"10.18653\/v1\/2022.emnlp-main.790"},{"key":"e_1_3_2_1_18_1","volume-title":"Mural: multimodal, multitask retrieval across languages. arXiv:2109.05125","author":"Jain Aashi","year":"2021","unstructured":"Aashi Jain, Mandy Guo, Krishna Srinivasan, Ting Chen, Sneha Kudugunta, Chao Jia, Yinfei Yang, and Jason Baldridge. 2021. Mural: multimodal, multitask retrieval across languages. arXiv:2109.05125 (2021)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Yatai Ji Rongcheng Tu Jie Jiang Weijie Kong Chengfei Cai Wenzhe Zhao Hongfa Wang Yujiu Yang and Wei Liu. 2023. Seeing What You Miss: Vision-Language Pre-training with Semantic Completion Learning. In CVPR. 6789--6798.","DOI":"10.1109\/CVPR52729.2023.00656"},{"key":"e_1_3_2_1_20_1","unstructured":"Chao Jia Yinfei Yang Ye Xia YiTing Chen Zarana Parekh Hieu Pham Quoc V. Le YunHsuan Sung Zhen Li and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In ICML. 4904--4916."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2598339"},{"key":"e_1_3_2_1_22_1","volume-title":"Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. Int. J. Comput. Vis.","author":"Krishna Ranjay","year":"2017","unstructured":"Ranjay Krishna, Yuke Zhu, Oliver Groth, Justin Johnson, Kenji Hata, Joshua Kravitz, Stephanie Chen, Yannis Kalantidis, LiJia Li, David A. Shamma, Michael S. Bernstein, and Li FeiFei. 2017. Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. Int. J. Comput. Vis. (2017), 32--73."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Anne Lauscher Vinit Ravishankar Ivan Vulic and Goran Glavas. 2020. From Zero to Hero: On the Limitations of Zero-Shot Language Transfer with Multilingual Transformers. In EMNLP. 4483--4499.","DOI":"10.18653\/v1\/2020.emnlp-main.363"},{"key":"e_1_3_2_1_24_1","unstructured":"Kuang-Huei Lee Xi Chen Gang Hua Houdong Hu and Xiaodong He. 2018. Stacked cross attention for image-text matching. In ECCV. 201--216."},{"key":"e_1_3_2_1_25_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML. 19730--19742."},{"key":"e_1_3_2_1_26_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. NeurIPS","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. NeurIPS (2021), 9694--9705."},{"key":"e_1_3_2_1_27_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019b. Visualbert: A simple and performant baseline for vision and language. arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2896494"},{"key":"e_1_3_2_1_29_1","volume-title":"Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In ECCV. 121--137.","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, Yejin Choi, and Jianfeng Gao. 2020. Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In ECCV. 121--137."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Zejun Li Zhihao Fan Jingjing Chen Qi Zhang Xuanjing Huang and Zhongyu Wei. 2023a. Unifying Cross-Lingual and Cross-Modal Modeling Towards Weakly Supervised Multilingual Vision-Language Pre-training. In ACL. 5939--5958.","DOI":"10.18653\/v1\/2023.acl-long.327"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Tsung-Yi Lin Michael Maire Serge J. Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV. 740--755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_32_1","volume-title":"Siva Reddy, Nigel Collier, and Desmond Elliott.","author":"Liu Fangyu","year":"2021","unstructured":"Fangyu Liu, Emanuele Bugliarello, Edoardo Maria Ponti, Siva Reddy, Nigel Collier, and Desmond Elliott. 2021. Visually Grounded Reasoning across Languages and Cultures. In EMNLP. 10467--10485."},{"key":"e_1_3_2_1_33_1","volume-title":"NeurIPS","volume":"36","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. NeurIPS, Vol. 36 (2024)."},{"key":"e_1_3_2_1_34_1","unstructured":"Minheng Ni Haoyang Huang Lin Su Edward Cui Taroon Bharti Lijuan Wang Dongdong Zhang and Nan Duan. 2021. M3P: Learning Universal Representations via Multitask Multilingual Multimodal Pre-Training. In CVPR. 3977--3986."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Jonas Pfeiffer Gregor Geigle Aishwarya Kamath Jan-Martin O. Steitz Stefan Roth Ivan Vulic and Iryna Gurevych. 2022. xGQA: Cross-Lingual Visual Question Answering. In ACL Findings. 2497--2511.","DOI":"10.18653\/v1\/2022.findings-acl.196"},{"key":"e_1_3_2_1_36_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Holger Schwenk Vishrav Chaudhary Shuo Sun Hongyu Gong and Francisco Guzm\u00e1n. 2021. WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia. In EACL. 1351--1361.","DOI":"10.18653\/v1\/2021.eacl-main.115"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"e_1_3_2_1_39_1","volume-title":"Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL. 2556--2565.","author":"Sharma Piyush","year":"2018","unstructured":"Piyush Sharma, Nan Ding, Sebastian Goodman, and Radu Soricut. 2018. Conceptual Captions: A Cleaned, Hypernymed, Image Alt-text Dataset For Automatic Image Captioning. In ACL. 2556--2565."},{"key":"e_1_3_2_1_40_1","volume-title":"WIT: Wikipedia-based Image Text Dataset for Multimodal Multilingual Machine Learning. In SIGIR. 2443--2449.","author":"Srinivasan Krishna","year":"2021","unstructured":"Krishna Srinivasan, Karthik Raman, Jiecao Chen, Michael Bendersky, and Marc Najork. 2021. WIT: Wikipedia-based Image Text Dataset for Multimodal Multilingual Machine Learning. In SIGIR. 2443--2449."},{"key":"e_1_3_2_1_41_1","unstructured":"Weijie Su Xizhou Zhu Yue Cao Bin Li Lewei Lu Furu Wei and Jifeng Dai. 2020. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In ICLR."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Guohao Sun Yue Bai Xueying Yang Yi Fang Yun Fu and Zhiqiang Tao. 2024. Aligning Out-of-Distribution Web Images and Caption Semantics via Evidential Learning. In WWW. 2271--2281.","DOI":"10.1145\/3589334.3645653"},{"key":"e_1_3_2_1_43_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_44_1","volume-title":"Saksham Singhal, Subhojit Som, et al.","author":"Wang Wenhui","year":"2023","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, et al. 2023. Image as a foreign language: Beit pretraining for vision and vision-language tasks. In CVPR. 19175--19186."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Wei Wei Jiabin Tang Lianghao Xia Yangqin Jiang and Chao Huang. 2024. PromptMM: Multi-Modal Knowledge Distillation for Recommendation with Prompt-Tuning. In WWW. 3217--3228.","DOI":"10.1145\/3589334.3645359"},{"key":"e_1_3_2_1_46_1","unstructured":"Xiangpeng Wei Rongxiang Weng Yue Hu Luxi Xing Heng Yu and Weihua Luo. 2021. On Learning Universal Representations Across Languages. In ICLR."},{"key":"e_1_3_2_1_47_1","unstructured":"Guillaume Wenzek Marie-Anne Lachaux Alexis Conneau Vishrav Chaudhary Francisco Guzm\u00e1n Armand Joulin and Edouard Grave. 2020. CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data. In LREC. 4003--4012."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Yuya Yoshikawa Yutaro Shigeto and Akikazu Takeuchi. 2017. STAIR Captions: Constructing a Large-Scale Japanese Image Caption Dataset. In ACL. 417--421.","DOI":"10.18653\/v1\/P17-2066"},{"key":"e_1_3_2_1_49_1","volume-title":"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguistics","author":"Young Peter","year":"2014","unstructured":"Peter Young, Alice Lai, Micah Hodosh, and Julia Hockenmaier. 2014. From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguistics (2014), 67--78."},{"key":"e_1_3_2_1_50_1","unstructured":"Lianmin Zheng WeiLin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric P. Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In NeurIPS."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Chulun Zhou Yunlong Liang Fandong Meng Jinan Xu Jinsong Su and Jie Zhou. 2023. RC3: Regularized Contrastive Cross-lingual Cross-modal Pre-training. In ACL Findings. 11747--11762.","DOI":"10.18653\/v1\/2023.findings-acl.746"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Mingyang Zhou Luowei Zhou Shuohang Wang Yu Cheng Linjie Li Zhou Yu and Jingjing Liu. 2021. UC2: Universal Cross-Lingual Cross-Modal Vision-and-Language Pre-Training. In CVPR. 4155--4165.","DOI":"10.1109\/CVPR46437.2021.00414"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714861","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714861","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:42Z","timestamp":1750295922000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714861"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":52,"alternative-id":["10.1145\/3696410.3714861","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714861","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}