{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T15:07:57Z","timestamp":1775920077712,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,7,11]],"date-time":"2021-07-11T00:00:00Z","timestamp":1625961600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,7,11]]},"DOI":"10.1145\/3404835.3463257","type":"proceedings-article","created":{"date-parts":[[2021,7,21]],"date-time":"2021-07-21T17:18:49Z","timestamp":1626887929000},"page":"2443-2449","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":139,"title":["WIT: Wikipedia-based Image Text Dataset for Multimodal Multilingual Machine Learning"],"prefix":"10.1145","author":[{"given":"Krishna","family":"Srinivasan","sequence":"first","affiliation":[{"name":"Google, Mountain View, CA, USA"}]},{"given":"Karthik","family":"Raman","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}]},{"given":"Jiecao","family":"Chen","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}]},{"given":"Michael","family":"Bendersky","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}]},{"given":"Marc","family":"Najork","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2021,7,11]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1219"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3412783"},{"key":"e_1_3_2_2_4_1","volume-title":"Proceedings of the Third Conference on Machine Translation (Volume 2: Shared Task Papers (WMT). 304--323","author":"Bougares Fethi","year":"2018","unstructured":"Lo\"ic Barrault, Fethi Bougares, Lucia Specia, Chiraag Lala, Desmond Elliott, and Stella Frank. 2018. Findings of the Third Shared Task on Multimodal Machine Translation. In Proceedings of the Third Conference on Machine Translation (Volume 2: Shared Task Papers (WMT). 304--323."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1809028.1806638"},{"key":"e_1_3_2_2_6_1","volume-title":"Conceptual 12M: Pushing Web-Scale Image-Text Pre-Training To Recognize Long-Tail Visual Concepts. arXiv preprint arXiv:2102.08981","author":"Changpinyo Soravit","year":"2021","unstructured":"Soravit Changpinyo, Piyush Sharma, Nan Ding, and Radu Soricut. 2021. Conceptual 12M: Pushing Web-Scale Image-Text Pre-Training To Recognize Long-Tail Visual Concepts. arXiv preprint arXiv:2102.08981 (2021)."},{"key":"e_1_3_2_2_7_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2019","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2019. Uniter: Learning universal image-text representations. arXiv preprint arXiv:1909.11740 (2019)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_9_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) (NAACL-HLT). 4171--4186."},{"key":"e_1_3_2_2_10_1","volume-title":"Proceedings of the 9th International Conference on Learning Representations (ICLR).","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In Proceedings of the 9th International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4718"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-3210"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1239"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1227"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.5555\/2566972.2566993"},{"key":"e_1_3_2_2_16_1","volume-title":"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. arXiv preprint arXiv:2102.05918","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V Le, Yunhsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. arXiv preprint arXiv:2102.05918 (2021)."},{"key":"e_1_3_2_2_17_1","volume-title":"Graph-RISE: Graph-Regularized Image Semantic Embedding. arXiv preprint arXiv:1902.10814","author":"Juan Da-Cheng","year":"2019","unstructured":"Da-Cheng Juan, Chun-Ta Lu, Zhen Li, Futang Peng, Aleksei Timofeev, Yi-Ting Chen, Yaxi Gao, Tom Duerig, Andrew Tomkins, and Sujith Ravi. 2019. Graph-RISE: Graph-Regularized Image Semantic Embedding. arXiv preprint arXiv:1902.10814 (2019)."},{"key":"e_1_3_2_2_18_1","volume-title":"2019 a. Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training. arXiv e-prints, page. arXiv preprint arXiv:1908.06066","author":"Li Gen","year":"2019","unstructured":"Gen Li, Nan Duan, Yuejian Fang, Ming Gong, Daxin Jiang, and Ming Zhou. 2019 a. Unicoder-VL: A Universal Encoder for Vision and Language by Cross-modal Pre-training. arXiv e-prints, page. arXiv preprint arXiv:1908.06066 (2019)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_2_20_1","volume-title":"2019 c. VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019 c. VisualBERT: A Simple and Performant Baseline for Vision and Language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2896494"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of the 33rd Conference on Neural Information Processing Systems (NeurIPS). 13--23","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Proceedings of the 33rd Conference on Neural Information Processing Systems (NeurIPS). 13--23."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_2_25_1","volume-title":"Proceedings of the 25th Annual Conference on Neural Information Processing Systems (NeurIPS). 1143--1151","author":"Ordonez Vicente","year":"2011","unstructured":"Vicente Ordonez, Girish Kulkarni, and Tamara Berg. 2011. Im2Text: Describing Images Using 1 Million Captioned Photographs. In Proceedings of the 25th Annual Conference on Neural Information Processing Systems (NeurIPS). 1143--1151."},{"key":"e_1_3_2_2_26_1","volume-title":"ImageBERT: Cross-modal Pre-training with Large-scale Weak-supervised Image-Text Data. arXiv preprint arXiv:2001.07966","author":"Qi Di","year":"2020","unstructured":"Di Qi, Lin Su, Jia Song, Edward Cui, Taroon Bharti, and Arun Sacheti. 2020. ImageBERT: Cross-modal Pre-training with Large-scale Weak-supervised Image-Text Data. arXiv preprint arXiv:2001.07966 (2020)."},{"key":"e_1_3_2_2_27_1","first-page":"T2","article-title":"Learning transferable visual models from natural language supervision","volume":"2","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning transferable visual models from natural language supervision. Image, Vol. 2 (2021), T2.","journal-title":"Image"},{"key":"e_1_3_2_2_28_1","first-page":"1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research, Vol. 21, 140 (2020), 1--67.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_2_30_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_2_31_1","volume-title":"Learning multilingual word embeddings using image-text data. arXiv preprint arXiv:1905.12260","author":"Singhal Karan","year":"2019","unstructured":"Karan Singhal, Karthik Raman, and Balder ten Cate. 2019. Learning multilingual word embeddings using image-text data. arXiv preprint arXiv:1905.12260 (2019)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-2346"},{"key":"e_1_3_2_2_33_1","volume-title":"Proceedings of the 8th International Conference on Learning Representations (ICLR).","author":"Su Weijie","year":"2020","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2020. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In Proceedings of the 8th International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.97"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_2_36_1","volume-title":"Proceedings of the 31st Conference on Neural Information Processing Systems (NeurIPS). 5998--6008","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Proceedings of the 31st Conference on Neural Information Processing Systems (NeurIPS). 5998--6008."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1077"},{"key":"e_1_3_2_2_39_1","volume-title":"mT5: A massively multilingual pre-trained text-to-text transformer. arXiv preprint arXiv:2010.11934","author":"Xue Linting","year":"2020","unstructured":"Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, and Colin Raffel. 2020. mT5: A massively multilingual pre-trained text-to-text transformer. arXiv preprint arXiv:2010.11934 (2020)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1400"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"}],"event":{"name":"SIGIR '21: The 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Virtual Event Canada","acronym":"SIGIR '21","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3463257","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3404835.3463257","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:17:44Z","timestamp":1750191464000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3463257"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,11]]},"references-count":44,"alternative-id":["10.1145\/3404835.3463257","10.1145\/3404835"],"URL":"https:\/\/doi.org\/10.1145\/3404835.3463257","relation":{},"subject":[],"published":{"date-parts":[[2021,7,11]]},"assertion":[{"value":"2021-07-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}