{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T09:52:56Z","timestamp":1761126776691,"version":"3.40.3"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030585471"},{"type":"electronic","value":"9783030585488"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58548-8_12","type":"book-chapter","created":{"date-parts":[[2020,10,28]],"date-time":"2020-10-28T23:02:42Z","timestamp":1603926162000},"page":"197-213","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Learning to Scale Multilingual Representations for Vision-Language Tasks"],"prefix":"10.1007","author":[{"given":"Andrea","family":"Burns","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Donghyun","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Derry","family":"Wijaya","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kate","family":"Saenko","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bryan A.","family":"Plummer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,10,29]]},"reference":[{"key":"12_CR1","doi-asserted-by":"crossref","unstructured":"Aharoni, R., Johnson, M., Firat, O.: Massively multilingual neural machine translation. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), June 2019","DOI":"10.18653\/v1\/N19-1388"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"12_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: The IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"12_CR4","unstructured":"Arora, S., Liang, Y., Ma, T.: A simple but tough-to-beat baseline for sentence embeddings. In: International Conference on Learning Representations (ICLR) (2017)"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Artetxe, M., Labaka, G., Agirre, E.: Learning principled bilingual mappings of word embeddings while preserving monolingual invariance. In: Empirical Methods in Natural Language Processing (EMNLP), pp. 2289\u20132294 (2016)","DOI":"10.18653\/v1\/D16-1250"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Barrault, L., Bougares, F., Specia, L., Lala, C., Elliott, D., Frank, S.: Findings of the third shared task on multimodal machine translation. In: Proceedings of the Third Conference on Machine Translation: Shared Task Papers, pp. 304\u2013323 (2018)","DOI":"10.18653\/v1\/W18-6402"},{"key":"12_CR7","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"P Bojanowski","year":"2017","unstructured":"Bojanowski, P., Grave, E., Joulin, A., Mikolov, T.: Enriching word vectors with subword information. Trans. Assoc. Comput. Linguist. (TACL) 5, 135\u2013146 (2017)","journal-title":"Trans. Assoc. Comput. Linguist. (TACL)"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Burns, A., Tan, R., Saenko, K., Sclaroff, S., Plummer, B.A.: Language features matter: effective language representations for vision-language tasks. In: The IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00757"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Cho, K., van Merrienboer, B., G\u00fcl\u00e7ehre, \u00c7., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. In: Empirical Methods in Natural Language Processing (EMNLP) (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"12_CR10","unstructured":"Conneau, A., Lample, G.: Cross-lingual language model pretraining. In: Advances in Neural Information Processing Systems (NeurIPS) (2019)"},{"key":"12_CR11","unstructured":"Conneau, A., Lample, G., Ranzato, M., Denoyer, L., J\u00e9gou, H.: Word translation without parallel data. In: International Conference on Learning Representations (ICLR) (2018)"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"12_CR13","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805v1 (2018)"},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Elliott, D., Frank, S., Barrault, L., Bougares, F., Specia, L.: Findings of the second shared task on multimodal machine translation and multilingual image description. arXiv:1710.07177 (2017)","DOI":"10.18653\/v1\/W17-4718"},{"key":"12_CR15","doi-asserted-by":"crossref","unstructured":"Elliott, D., Frank, S., Sima\u2019an, K., Specia, L.: Multi30k: multilingual English-German image descriptions. arXiv:1605.00459 (2016)","DOI":"10.18653\/v1\/W16-3210"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Gella, S., Sennrich, R., Keller, F., Lapata, M.: Image pivoting for learning multilingual multimodal representations. In: Empirical Methods in Natural Language Processing (EMNLP) (2017)","DOI":"10.18653\/v1\/D17-1303"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in Visual Question Answering. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Gu, J., Hassan, H., Devlin, J., Li, V.O.: Universal neural machine translation for extremely low resource languages. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (ACL-HLT) (2018)","DOI":"10.18653\/v1\/N18-1032"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"Gupta, T., Schwing, A., Hoiem, D.: Vico: word embeddings from visual co-occurrences. In: The IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00752"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. arXiv:1512.03385 (2015)","DOI":"10.1109\/CVPR.2016.90"},{"key":"12_CR21","unstructured":"K, K., Wang, Z., Mayhew, S., Roth, D.: Cross-lingual ability of multilingual bert: an empirical study. arXiv:1912.07840 (2019)"},{"key":"12_CR22","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferitGame: referring to objects in photographs of natural scenes. In: Empirical Methods in Natural Language Processing (EMNLP) (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Kim, D., Saito, K., Saenko, K., Sclaroff, S., Plummer, B.A.: Mule: multimodal universal language embedding. In: AAAI Conference on Artificial Intelligence (2020)","DOI":"10.1609\/aaai.v34i07.6785"},{"key":"12_CR24","unstructured":"Klein, B., Lev, G., Sadeh, G., Wolf, L.: Fisher vectors derived from hybrid Gaussian-Laplacian mixture models for image annotation. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Krishna, R., et al.: Visual genome: Connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. (IJCV) (2017)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"12_CR26","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv:1908.03557 (2019)"},{"key":"12_CR27","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: COCO-CN for cross-lingual image tagging, captioning and retrieval. IEEE Trans. Multimedia (2019)","DOI":"10.1109\/TMM.2019.2896494"},{"key":"12_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"12_CR29","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv:1908.02265 (2019)"},{"issue":"3","key":"12_CR30","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/0098-3004(93)90090-R","volume":"19","author":"A Ma\u0107kiewicz","year":"1993","unstructured":"Ma\u0107kiewicz, A., Ratajczak, W.: Principal components analysis (PCA). Comput. Geosci. 19(3), 303\u2013342 (1993)","journal-title":"Comput. Geosci."},{"key":"12_CR31","doi-asserted-by":"crossref","unstructured":"Miyazaki, T., Shimizu, N.: Cross-lingual image caption generation. In: Conference of the Association for Computational Linguistics (ACL) (2016)","DOI":"10.18653\/v1\/P16-1168"},{"key":"12_CR32","doi-asserted-by":"crossref","unstructured":"Nguyen, D.K., Okatani, T.: Multi-task learning of hierarchical vision-language representation. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.01074"},{"key":"12_CR33","doi-asserted-by":"crossref","unstructured":"Pires, T., Schlinger, E., Garrette, D.: How multilingual is multilingual bert? arXiv:1906.01502 (2019)","DOI":"10.18653\/v1\/P19-1493"},{"key":"12_CR34","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: The IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"12_CR35","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems (NeurIPS) (2015)"},{"key":"12_CR36","unstructured":"Smith, S.L., Turban, D.H.P., Hamblin, S., Hammerla, N.Y.: Offline bilingual word vectors, orthogonal transformations and the inverted softmax. arXiv:1702.03859 (2017)"},{"key":"12_CR37","unstructured":"Su, W., et al.: Vl-BERT: pre-training of generic visual-linguistic representations. arXiv:1908.08530 (2019)"},{"key":"12_CR38","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing (EMNLP) (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"12_CR39","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems (NeurIPS), pp. 5998\u20136008 (2017)"},{"issue":"2","key":"12_CR40","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1109\/TPAMI.2018.2797921","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., Li, Y., Huang, J., Lazebnik, S.: Learning two-branch neural networks for image-text matching tasks. IEEE Trans. Pattern Anal. Mach. Intell.(TPAMI) 41(2), 394\u2013407 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell.(TPAMI)"},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Wehrmann, J., Souza, D.M., Lopes, M.A., Barros, R.C.: Language-agnostic visual-semantic embeddings. In: The IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00590"},{"key":"12_CR42","doi-asserted-by":"crossref","unstructured":"Wu, S., Dredze, M.: Beto, Bentz, Becas: the surprising cross-lingual effectiveness of Bert. arXiv:1904.09077 (2019)","DOI":"10.18653\/v1\/D19-1077"},{"key":"12_CR43","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. (TACL) 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist. (TACL)"},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: visual commonsense reasoning. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00688"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58548-8_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:06:10Z","timestamp":1730160370000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58548-8_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585471","9783030585488"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58548-8_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"29 October 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}