{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:01:47Z","timestamp":1777611707111,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":82,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611992","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"3637-3646","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Cross-Lingual Transfer of Large Language Model by Visually-Derived Supervision Toward Low-Resource Languages"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3620-1231","authenticated-orcid":false,"given":"Masayasu","family":"Muraoka","sequence":"first","affiliation":[{"name":"IBM Research, Chuo-ku, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7097-4891","authenticated-orcid":false,"given":"Bishwaranjan","family":"Bhattacharjee","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4358-8671","authenticated-orcid":false,"given":"Michele","family":"Merler","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0609-7613","authenticated-orcid":false,"given":"Graeme","family":"Blackwood","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3412-0732","authenticated-orcid":false,"given":"Yulong","family":"Li","sequence":"additional","affiliation":[{"name":"IBM Research, Yorktown Heights, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3774-6773","authenticated-orcid":false,"given":"Yang","family":"Zhao","sequence":"additional","affiliation":[{"name":"IBM Research, Chuo-ku, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Gerry TM Altmann and Yuki Kamide. 2004. Now you see it now you don't: mediating the mapping between language and the visual world. In The interface of language vision and action: eye movements and the visual world. 347--368."},{"key":"e_1_3_2_1_2_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E. Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E. Hinton. 2016. Layer Normalization. arXiv:1607.06450 [stat.ML]"},{"key":"e_1_3_2_1_3_1","volume-title":"ICLR","author":"Bahdanau Dzmitry","unstructured":"Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2015. Neural Machine Translation by Jointly Learning to Align and Translate. In ICLR. http:\/\/arxiv.org\/ abs\/1409.0473"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Paul Bloom. 2000. How children learn the meanings of words. (2000).","DOI":"10.7551\/mitpress\/3577.001.0001"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"Patrick Bordes Eloi Zablocki Laure Soulier Benjamin Piwowarski and Patrick Gallinari. 2019. Incorporating Visual Semantics into Sentence Representations within a Grounded Space. In EMNLP-IJCNLP. 696--707. https:\/\/doi.org\/10.18653\/v1\/D19-1064","DOI":"10.18653\/v1\/D19-1064"},{"key":"e_1_3_2_1_6_1","volume-title":"Pranava Madhyastha, Erkut Erdem, Aykut Erdem, and Lucia Specia.","author":"Caglayan Ozan","year":"2021","unstructured":"Ozan Caglayan, Menekse Kuyu, Mustafa Sercan Amac, Pranava Madhyastha, Erkut Erdem, Aykut Erdem, and Lucia Specia. 2021. Cross-lingual Visual Pretraining for Multimodal Machine Translation. In EACL. 1317--1324. https:\/\/doi. org\/10.18653\/v1\/2021.eacl-main.112"},{"key":"e_1_3_2_1_7_1","volume-title":"Carlos Riquelme Ruiz, Andreas Peter Steiner, Anelia Angelova, Xiaohua Zhai, Neil Houlsby, and Radu Soricut.","author":"Chen Xi","year":"2023","unstructured":"Xi Chen, Xiao Wang, Soravit Changpinyo, AJ Piergiovanni, Piotr Padlewski, Daniel Salz, Sebastian Goodman, Adam Grycner, Basil Mustafa, Lucas Beyer, Alexander Kolesnikov, Joan Puigcerver, Nan Ding, Keran Rong, Hassan Akbari, Gaurav Mishra, Linting Xue, Ashish V Thapliyal, James Bradbury, Weicheng Kuo, Mojtaba Seyedhosseini, Chao Jia, Burcu Karagol Ayan, Carlos Riquelme Ruiz, Andreas Peter Steiner, Anelia Angelova, Xiaohua Zhai, Neil Houlsby, and Radu Soricut. 2023. PaLI: A Jointly-Scaled Multilingual Language-Image Model. In ICLR. https:\/\/openreview.net\/forum?id=mWVoBz4W0u"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","unstructured":"Gordon Christie Ankit Laddha Aishwarya Agrawal Stanislaw Antol Yash Goyal Kevin Kochersberger and Dhruv Batra. 2016. Resolving Language and Vision Ambiguities Together: Joint Segmentation & Prepositional Attachment Resolution in Captioned Scenes. In EMNLP. 1493--1503. https:\/\/doi.org\/10.18653\/v1\/D16-1156","DOI":"10.18653\/v1\/D16-1156"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00317"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11155"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","unstructured":"Alexis Conneau Kartikay Khandelwal Naman Goyal Vishrav Chaudhary Guillaume Wenzek Francisco Guzm\u00e1n Edouard Grave Myle Ott Luke Zettlemoyer and Veselin Stoyanov. 2020. Unsupervised Cross-lingual Representation Learning at Scale. In ACL. 8440--8451. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.747","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"e_1_3_2_1_12_1","unstructured":"Alexis Conneau and Guillaume Lample. 2019. Cross-Lingual Language Model Pretraining. In NeurIPS. https:\/\/dl.acm.org\/doi\/abs\/10.5555\/3454287.3454921"},{"key":"e_1_3_2_1_13_1","volume-title":"XNLI: Evaluating Crosslingual Sentence Representations. In EMNLP. 2475--2485. https:\/\/doi.org\/10. 18653\/v1\/D18-1269","author":"Conneau Alexis","year":"2018","unstructured":"Alexis Conneau, Ruty Rinott, Guillaume Lample, Adina Williams, Samuel Bowman, Holger Schwenk, and Veselin Stoyanov. 2018. XNLI: Evaluating Crosslingual Sentence Representations. In EMNLP. 2475--2485. https:\/\/doi.org\/10. 18653\/v1\/D18-1269"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","unstructured":"Alexis Conneau Shijie Wu Haoran Li Luke Zettlemoyer and Veselin Stoyanov. 2020. Emerging Cross-lingual Structure in Pretrained Language Models. In ACL. 6022--6034. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.536","DOI":"10.18653\/v1\/2020.acl-main.536"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cognition.2013.02.003"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_2_1_17_1","volume-title":"Multilingual BERT readme document. Retrieved","author":"Devlin Jacob","year":"2023","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Multilingual BERT readme document. Retrieved April 1, 2023 from https: \/\/github.com\/google-research\/bert\/blob\/master\/multilingual.md"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-3210"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/"},{"key":"e_1_3_2_1_20_1","volume-title":"Cluster analysis of multivariate data: Efficiency vs. interpretability of classifications. biometrics 21","author":"Forgy Edward W","year":"1965","unstructured":"Edward W Forgy. 1965. Cluster analysis of multivariate data: Efficiency vs. interpretability of classifications. biometrics 21 (1965), 768--769."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Ruka Funaki and Hideki Nakayama. 2015. Image-Mediated Learning for Zero-Shot Cross-Lingual Document Retrieval. In EMNLP. 585--590. https:\/\/doi.org\/10. 18653\/v1\/D15-1070","DOI":"10.18653\/v1\/D15-1070"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/S1364-6613(99)01397-2"},{"key":"e_1_3_2_1_23_1","volume-title":"The IAPR TC-12 Benchmark: A New Evaluation Resource for Visual Information Systems. In OntoImage 2006 Workshop on Language Resources for Content-based Image Retrieval during LREC 2006 Final Programme.","author":"Grubinger Michael","year":"2006","unstructured":"Michael Grubinger, Paul Clough, Henning M\u00fcller, and Thomas Deselaers. 2006. The IAPR TC-12 Benchmark: A New Evaluation Resource for Visual Information Systems. In OntoImage 2006 Workshop on Language Resources for Content-based Image Retrieval during LREC 2006 Final Programme."},{"key":"e_1_3_2_1_24_1","unstructured":"Jiuxiang Gu Jianfei Cai Shafiq R. Joty Li Niu and GangWang. 2018. Look Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval With Generative Models. In CVPR. https:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Gu_ Look_Imagine_and_CVPR_2018_paper.html"},{"key":"e_1_3_2_1_25_1","volume-title":"Qinyu Zhang, and Ji-Rong Wen.","author":"Guo Hangyu","year":"2023","unstructured":"Hangyu Guo, Kun Zhou, Wayne Xin Zhao, Qinyu Zhang, and Ji-Rong Wen. 2023. Visually-augmented pretrained language models for NLP tasks without images. In ACL. 14912--14929. https:\/\/aclanthology.org\/2023.acl-long.833"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Raia Hadsell Sumit Chopra and Yann LeCun. 2006. Dimensionality Reduction by Learning an Invariant Mapping. In CVPR. 1735--1742. https:\/\/doi.org\/10.1109\/ CVPR.2006.100","DOI":"10.1109\/CVPR.2006.100"},{"key":"e_1_3_2_1_27_1","unstructured":"Dan Hendrycks and Kevin Gimpel. 2020. Gaussian Error Linear Units (GELUs). arXiv:1606.08415 [cs.LG]"},{"key":"e_1_3_2_1_28_1","first-page":"4411","article-title":"XTREME: A Massively Multilingual Multi-task Benchmark for Evaluating Cross-lingual Generalisation","volume":"119","author":"Hu Junjie","year":"2020","unstructured":"Junjie Hu, Sebastian Ruder, Aditya Siddhant, Graham Neubig, Orhan Firat, and Melvin Johnson. 2020. XTREME: A Massively Multilingual Multi-task Benchmark for Evaluating Cross-lingual Generalisation. In ICML, Vol. 119. 4411--4421. https: \/\/proceedings.mlr.press\/v119\/hu20b.html","journal-title":"ICML"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","unstructured":"Po-Yao Huang Mandela Patrick Junjie Hu Graham Neubig Florian Metze and Alexander Hauptmann. 2021. Multilingual Multimodal Pre-training for Zero-Shot Cross-Lingual Transfer of Vision-Language Models. In NAACL. 2443--2459. https:\/\/doi.org\/10.18653\/v1\/2021.naacl-main.195","DOI":"10.18653\/v1\/2021.naacl-main.195"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","unstructured":"Julia Ive Pranava Madhyastha and Lucia Specia. 2019. Distilling Translations with Visual Awareness. In ACL. 6525--6538. https:\/\/doi.org\/10.18653\/v1\/P19-1653","DOI":"10.18653\/v1\/P19-1653"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Douwe Kiela Alexis Conneau Allan Jabri and Maximilian Nickel. 2018. Learning Visually Grounded Sentence Representations. In NAACL. 408--418. https:\/\/doi. org\/10.18653\/v1\/N18-1038","DOI":"10.18653\/v1\/N18-1038"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Douwe Kiela Ivan Vuli\u0107 and Stephen Clark. 2015. Visual Bilingual Lexicon Induction with Transferred ConvNet Features. In EMNLP. 148--158. https:\/\/doi. org\/10.18653\/v1\/D15--1015","DOI":"10.18653\/v1\/D15-1015"},{"key":"e_1_3_2_1_34_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In ICLR. http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"e_1_3_2_1_35_1","volume-title":"Sorting and Searching","author":"Knuth Donald Ervin","unstructured":"Donald Ervin Knuth. 1998. The Art of Computer Programming. Vol. 3: Sorting and Searching. Addison-Wesley."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","unstructured":"Noriyuki Kojima Hadar Averbuch-Elor Alexander Rush and Yoav Artzi. 2020. What is Learned in Visually Grounded Neural Syntax Acquisition. In ACL. 2615--2635. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.234","DOI":"10.18653\/v1\/2020.acl-main.234"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Chen Kong Dahua Lin Mohit Bansal Raquel Urtasun and Sanja Fidler. 2014. What are You Talking About? Text-to-Image Coreference. In CVPR. https:\/\/openaccess.thecvf.com\/content_cvpr_2014\/html\/Kong_What_ are_You_2014_CVPR_paper.html","DOI":"10.1109\/CVPR.2014.455"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1016"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.48"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Liunian Harold Li Haoxuan You Zhecan Wang Alireza Zareian Shih-Fu Chang and Kai-Wei Chang. 2021. Unsupervised Vision-and-Language Pre-training Without Parallel Images and Captions. In NAACL. 5339--5350. https:\/\/doi.org\/10. 18653\/v1\/2021.naacl-main.420","DOI":"10.18653\/v1\/2021.naacl-main.420"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV. 740--755. https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","unstructured":"Yu-Hsiang Lin Chian-Yu Chen Jean Lee Zirui Li Yuyan Zhang Mengzhou Xia Shruti Rijhwani Junxian He Zhisong Zhang Xuezhe Ma Antonios Anastasopoulos Patrick Littell and Graham Neubig. 2019. Choosing Transfer Languages for Cross-Lingual Learning. In ACL. 3125--3135. https:\/\/doi.org\/10.18653\/v1\/P19-1301","DOI":"10.18653\/v1\/P19-1301"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","unstructured":"Xiao Liu Da Yin Yansong Feng and Dongyan Zhao. 2022. Things not Written in Text: Exploring Spatial Commonsense from Visual Signals. In ACL. 2365--2376. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.168","DOI":"10.18653\/v1\/2022.acl-long.168"},{"key":"e_1_3_2_1_46_1","unstructured":"Yinhan Liu Myle Ott Naman Goyal Jingfei Du Mandar Joshi Danqi Chen Omer Levy Mike Lewis Luke Zettlemoyer and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv:1907.11692 [cs.CL]"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1982.1056489"},{"key":"e_1_3_2_1_48_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. DecoupledWeight Decay Regularization. In ICLR. https:\/\/openreview.net\/forum?id=Bkg6RiCqY7"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","unstructured":"Yujie Lu Wanrong Zhu Xin Wang Miguel Eckstein and William Yang Wang. 2022. Imagination-Augmented Natural Language Understanding. In NAACL. 4392--4402. https:\/\/doi.org\/10.18653\/v1\/2022.naacl-main.326","DOI":"10.18653\/v1\/2022.naacl-main.326"},{"key":"e_1_3_2_1_50_1","unstructured":"TorchVision maintainers and contributors. 2016. TorchVision: PyTorch's Computer Vision library. https:\/\/github.com\/pytorch\/vision"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.21105\/joss.00861"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","unstructured":"Masayasu Muraoka Tetsuya Nasukawa and Bishwaranjan Bhattacharjee. 2020. Visual Objects As Context: Exploiting Visual Objects for Lexical Entailment. In Findings of EMNLP. 2723--2735. https:\/\/doi.org\/10.18653\/v1\/2020.findingsemnlp. 246","DOI":"10.18653\/v1\/2020.findingsemnlp"},{"key":"e_1_3_2_1_53_1","volume-title":"Hinton","author":"Nair Vinod","year":"2010","unstructured":"Vinod Nair and Geoffrey E. Hinton. 2010. Rectified Linear Units Improve Restricted Boltzmann Machines. In ICML. 807--814. https:\/\/dl.acm.org\/doi\/10.5555\/ 3104322.3104425"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","unstructured":"Xiaoman Pan Boliang Zhang Jonathan May Joel Nothman Kevin Knight and Heng Ji. 2017. Cross-lingual Name Tagging and Linking for 282 Languages. In ACL. 1946--1958. https:\/\/doi.org\/10.18653\/v1\/P17-1178","DOI":"10.18653\/v1\/P17-1178"},{"key":"e_1_3_2_1_55_1","volume-title":"Dual Language Development and Disorders: A Handbook on Bilingualism and Second Language Learning","author":"Paradis Johanne","year":"2011","unstructured":"Johanne Paradis, Fred Genesee, and Martha B Crago. 2011. Dual Language Development and Disorders: A Handbook on Bilingualism and Second Language Learning. Brookes Publishing Company (2011)."},{"key":"e_1_3_2_1_56_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In NeurIPS. 8024--8035. http:\/\/papers.neurips.cc\/paper\/9015-pytorchan-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","unstructured":"Telmo Pires Eva Schlinger and Dan Garrette. 2019. How Multilingual is Multilingual BERT?. In ACL. 4996--5001. https:\/\/doi.org\/10.18653\/v1\/P19--1493","DOI":"10.18653\/v1\/P19--1493"},{"key":"e_1_3_2_1_58_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019). https:\/\/openai.com\/research\/better-language-models"},{"key":"e_1_3_2_1_59_1","volume-title":"So Kweon, and Joon Son Chung.","author":"Ryu Hyeonggon","year":"2023","unstructured":"Hyeonggon Ryu, Arda Senocak, In So Kweon, and Joon Son Chung. 2023. Hindi as a Second Language: Improving Visually Grounded Speech with Semantically Similar Samples. arXiv:2303.17517 [cs.CL] https:\/\/arxiv.org\/abs\/2303.17517"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"Florian Schroff Dmitry Kalenichenko and James Philbin. 2015. FaceNet: A Unified Embedding for Face Recognition and Clustering. In CVPR. https:\/\/www.cv-foundation.org\/openaccess\/content_cvpr_2015\/html\/Schroff_ FaceNet_A_Unified_2015_CVPR_paper.html","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Haoyue Shi Jiayuan Mao Kevin Gimpel and Karen Livescu. 2019. Visually Grounded Neural Syntax Acquisition. In ACL. 1842--1861. https:\/\/doi.org\/10. 18653\/v1\/P19-1180","DOI":"10.18653\/v1\/P19-1180"},{"key":"e_1_3_2_1_62_1","first-page":"539","article-title":"Using Visuals to Help the Second Language Learner","volume":"34","author":"Sinatra Richard","year":"1981","unstructured":"Richard Sinatra. 1981. Using Visuals to Help the Second Language Learner. The Reading Teacher 34, 5 (1981), 539--546. http:\/\/www.jstor.org\/stable\/20195283","journal-title":"The Reading Teacher"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"D\u00eddac Sur\u00eds Dave Epstein and Carl Vondrick. 2022. Globetrotter: Connecting Languages by Connecting Images. In CVPR. 16474--16484. https:\/\/openaccess.thecvf.com\/content\/CVPR2022\/html\/Suris_Globetrotter_ Connecting_Languages_by_Connecting_Images_CVPR_2022_paper.html","DOI":"10.1109\/CVPR52688.2022.01598"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.162"},{"key":"e_1_3_2_1_66_1","first-page":"24468","article-title":"VidLanKD: Improving Language Understanding via Video-Distilled Knowledge Transfer","volume":"34","author":"Tang Zineng","year":"2021","unstructured":"Zineng Tang, Jaemin Cho, Hao Tan, and Mohit Bansal. 2021. VidLanKD: Improving Language Understanding via Video-Distilled Knowledge Transfer. In NeurIPS, Vol. 34. 24468--24481. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2021\/file\/ccdf3864e2fa9089f9eca4fc7a48ea0a-Paper.pdf","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_67_1","volume-title":"NeurIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In NeurIPS, Vol. 30. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/ 2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","unstructured":"Ivan Vuli\u0107 Douwe Kiela Stephen Clark and Marie-Francine Moens. 2016. Multi-Modal Representations for Improved Bilingual Lexicon Learning. In ACL. 188--194. https:\/\/doi.org\/10.18653\/v1\/P16--2031","DOI":"10.18653\/v1\/P16--2031"},{"key":"e_1_3_2_1_69_1","volume-title":"Bowman","author":"Singh Amanpreet","year":"2019","unstructured":"AlexWang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. 2019. GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. In ICLR. https:\/\/openreview.net\/forum?id= rJ4km2R5t7"},{"key":"e_1_3_2_1_70_1","unstructured":"Weizhi Wang Li Dong Hao Cheng Haoyu Song Xiaodong Liu Xifeng Yan Jianfeng Gao and Furu Wei. 2023. Visually-Augmented Language Modeling. In ICLR. https:\/\/openreview.net\/forum?id=8IN-qLkl215"},{"key":"e_1_3_2_1_71_1","unstructured":"Guillaume Wenzek Marie-Anne Lachaux Alexis Conneau Vishrav Chaudhary Francisco Guzm\u00e1n Armand Joulin and Edouard Grave. 2020. CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data. In LREC. 4003--4012. https:\/\/aclanthology.org\/2020.lrec-1.494"},{"key":"e_1_3_2_1_72_1","volume-title":"Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander Rush.","author":"Debut Lysandre","year":"2020","unstructured":"ThomasWolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, Remi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In EMNLP: System Demonstrations. 38--45. https:\/\/doi.org\/10.18653\/ v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_73_1","volume-title":"ICML Workshop on The How2 Challenge: New Tasks for Vision and Language. https:\/\/srvk.github.io\/how2-challenge\/assets\/authors\/1908","author":"Wu Zixiu","year":"2019","unstructured":"Zixiu Wu, Julia Ive, Josiah Wang, Pranava Madhyastha, and Lucia Specia. 2019. Predicting Actions to Help Predict Translations. In ICML Workshop on The How2 Challenge: New Tasks for Vision and Language. https:\/\/srvk.github.io\/how2-challenge\/assets\/authors\/1908.01665.pdf"},{"key":"e_1_3_2_1_74_1","volume-title":"Visual Entailment: A Novel Task for Fine-Grained Image Understanding. arXiv:1901.06706 [cs.CV]","author":"Xie Ning","year":"2019","unstructured":"Ning Xie, Farley Lai, Derek Doran, and Asim Kadav. 2019. Visual Entailment: A Novel Task for Fine-Grained Image Understanding. arXiv:1901.06706 [cs.CV]"},{"key":"e_1_3_2_1_75_1","volume-title":"Zhuowen Tu, and Kaiming He.","author":"Xie Saining","year":"2017","unstructured":"Saining Xie, Ross Girshick, Piotr Dollar, Zhuowen Tu, and Kaiming He. 2017. Aggregated Residual Transformations for Deep Neural Networks. In CVPR. https:\/\/openaccess.thecvf.com\/content_cvpr_2017\/html\/Xie_Aggregated_ Residual_Transformations_CVPR_2017_paper.html"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"crossref","unstructured":"Yue Yang Wenlin Yao Hongming Zhang Xiaoyang Wang Dong Yu and Jianshu Chen. 2022. Z-LaVI: Zero-Shot Language Solver Fueled by Visual Imagination. In EMNLP. 1186--1203. https:\/\/aclanthology.org\/2022.emnlp-main.78","DOI":"10.18653\/v1\/2022.emnlp-main.78"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"crossref","unstructured":"Thomas Zenkel Joern Wuebker and John DeNero. 2020. End-to-End Neural Word Alignment Outperforms GIZA. In ACL. 1605--1617. https:\/\/doi.org\/10. 18653\/v1\/2020.acl-main.146","DOI":"10.18653\/v1\/2020.acl-main.146"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.436"},{"key":"e_1_3_2_1_79_1","unstructured":"Zhuosheng Zhang Kehai Chen Rui Wang Masao Utiyama Eiichiro Sumita Zuchao Li and Hai Zhao. 2020. Neural Machine Translation with Universal Visual Representation. In ICLR. https:\/\/openreview.net\/forum?id=Byl8hhNYPS"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"crossref","unstructured":"Mingyang Zhou Licheng Yu Amanpreet Singh Mengjiao Wang Zhou Yu and Ning Zhang. 2022. Unsupervised Vision-and-Language Pre-Training via Retrieval-Based Multi-Granular Alignment. In CVPR. 16485--16494. https:\/\/openaccess.thecvf.com\/content\/CVPR2022\/html\/ Zhou_Unsupervised_Vision-and-Language_Pre-Training_via_Retrieval-Based_Multi-Granular_Alignment_CVPR_2022_paper.html","DOI":"10.1109\/CVPR52688.2022.01599"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"crossref","unstructured":"Mingyang Zhou Luowei Zhou Shuohang Wang Yu Cheng Linjie Li Zhou Yu and Jingjing Liu. 2021. UC2: Universal Cross-Lingual Cross-Modal Vision and-Language Pre-Training. In CVPR. 4155--4165. https:\/\/openaccess.thecvf. com\/content\/CVPR2021\/html\/Zhou_UC2_Universal_Cross-Lingual_Cross-Modal_Vision-and-Language_Pre-Training_CVPR_2021_paper.html","DOI":"10.1109\/CVPR46437.2021.00414"},{"key":"e_1_3_2_1_82_1","volume-title":"An Yan, Miguel Eckstein, and William Yang Wang.","author":"Zhu Wanrong","year":"2023","unstructured":"Wanrong Zhu, Xin Eric Wang, An Yan, Miguel Eckstein, and William Yang Wang. 2023. ImaginE: An Imagination-Based Automatic Evaluation Metric for Natural Language Generation. arXiv:2106.05970 [cs.CL]"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611992","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611992","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:11:56Z","timestamp":1755821516000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611992"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":82,"alternative-id":["10.1145\/3581783.3611992","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611992","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}