{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T21:15:45Z","timestamp":1757625345679,"version":"3.44.0"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032025500"},{"type":"electronic","value":"9783032025517"}],"license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02551-7_11","type":"book-chapter","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:27:33Z","timestamp":1755754053000},"page":"115-127","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Investigating the\u00a0Effect of\u00a0Parallel Data in\u00a0the\u00a0Cross-Lingual Transfer for\u00a0Vision-Language Encoders"],"prefix":"10.1007","author":[{"given":"Andrei-Alexandru","family":"Manea","sequence":"first","affiliation":[]},{"given":"Jind\u0159ich","family":"Libovick\u00fd","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"11_CR1","unstructured":"Bugliarello, E., et al.: IGLUE: a benchmark for transfer learning across modalities, tasks, and languages. CoRR abs\/2201.11732 (2022). 
https:\/\/arxiv.org\/abs\/2201.11732"},{"key":"11_CR2","unstructured":"Cao, S., Kitaev, N., Klein, D.: Multilingual alignment of contextual word representations. In: International Conference on Learning Representations (2020). https:\/\/openreview.net\/forum?id=r1xCMyBtPS"},{"key":"11_CR3","unstructured":"Carlsson, F., Eisen, P., Rekathati, F., Sahlgren, M.: Cross-lingual and multilingual CLIP. In: Proceedings of the Thirteenth Language Resources and Evaluation Conference, pp. 6848\u20136854. European Language Resources Association, Marseille, France (2022). https:\/\/aclanthology.org\/2022.lrec-1.739\/"},{"key":"11_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: UNiversal image-TExt representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"11_CR5","doi-asserted-by":"publisher","unstructured":"Conneau, A., et al.: Unsupervised cross-lingual representation learning at scale. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 8440\u20138451. ACL, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.747","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"11_CR6","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long and Short Papers), pp. 4171\u20134186. ACL, Minneapolis, Minnesota (2019). 
https:\/\/doi.org\/10.18653\/v1\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Dou, Z., Neubig, G.: Word alignment by fine-tuning embeddings on parallel corpora. CoRR abs\/2101.08231 (2021). https:\/\/arxiv.org\/abs\/2101.08231","DOI":"10.18653\/v1\/2021.eacl-main.181"},{"key":"11_CR8","doi-asserted-by":"publisher","unstructured":"Feng, F., Yang, Y., Cer, D., Arivazhagan, N., Wang, W.: Language-agnostic BERT sentence embedding. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 878\u2013891. ACL, Dublin, Ireland (2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.62","DOI":"10.18653\/v1\/2022.acl-long.62"},{"key":"11_CR9","doi-asserted-by":"publisher","unstructured":"Geigle, G., Jain, A., Timofte, R., Glava\u0161, G.: mBLIP: efficient bootstrapping of multilingual vision-LLMs. In: Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR), pp. 7\u201325. ACL, Bangkok, Thailand (2024). https:\/\/doi.org\/10.18653\/v1\/2024.alvr-1.2","DOI":"10.18653\/v1\/2024.alvr-1.2"},{"key":"11_CR10","doi-asserted-by":"publisher","unstructured":"Karoui, Y., Lebret, R., Foroutan\u00a0Eghlidi, N., Aberer, K.: Stop pre-training: adapt visual-language models to unseen languages. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 366\u2013375. ACL, Toronto, Canada (2023). https:\/\/doi.org\/10.18653\/v1\/2023.acl-short.32","DOI":"10.18653\/v1\/2023.acl-short.32"},{"key":"11_CR11","doi-asserted-by":"publisher","unstructured":"Kwon, G., Cai, Z., Ravichandran, A., Bas, E., Bhotika, R., Soatto, S.: Masked vision and language modeling for multi-modal representation learning. CoRR abs\/2208.02131 (2022). 
https:\/\/doi.org\/10.48550\/ARXIV.2208.02131","DOI":"10.48550\/ARXIV.2208.02131"},{"key":"11_CR12","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. In: Advances in Neural Information Processing Systems, vol.\u00a034, pp. 9694\u20139705. Curran Associates, Inc. (2021)"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Liu, F., Bugliarello, E., Ponti, E.M., Reddy, S., Collier, N., Elliott, D.: Visually grounded reasoning across languages and cultures. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. pp. 10467\u201310485. ACL, Online and Punta Cana, Dominican Republic (2021). https:\/\/aclanthology.org\/2021.emnlp-main.818\/","DOI":"10.18653\/v1\/2021.emnlp-main.818"},{"key":"11_CR14","doi-asserted-by":"crossref","unstructured":"Liu, F., Bugliarello, E., Ponti, E.M., Reddy, S., Collier, N., Elliott, D.: Visually grounded reasoning across languages and cultures. CoRR abs\/2109.13238 (2021). https:\/\/arxiv.org\/abs\/2109.13238","DOI":"10.18653\/v1\/2021.emnlp-main.818"},{"key":"11_CR15","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach. CoRR abs\/1907.11692 (2019). http:\/\/arxiv.org\/abs\/1907.11692"},{"key":"11_CR16","unstructured":"Ni, M., et al.: M3P: Learning universal representations via multitask multilingual multimodal pre-training. CoRR abs\/2006.02635 (2020). https:\/\/arxiv.org\/abs\/2006.02635"},{"issue":"1","key":"11_CR17","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1515\/pralin-2016-0013","volume":"106","author":"R \u00d6stling","year":"2016","unstructured":"\u00d6stling, R., Tiedemann, J.: Efficient word alignment with Markov chain Monte Carlo. Prague Bull. Math. Linguist. 106(1), 125 (2016)","journal-title":"Prague Bull. Math. 
Linguist."},{"key":"11_CR18","doi-asserted-by":"publisher","unstructured":"Pfeiffer, J., et al.: Lifting the curse of multilinguality by pre-training modular transformers. In: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 3479\u20133495. ACL, Seattle, United States (2022). https:\/\/doi.org\/10.18653\/v1\/2022.naacl-main.255","DOI":"10.18653\/v1\/2022.naacl-main.255"},{"key":"11_CR19","doi-asserted-by":"publisher","unstructured":"Pires, T., Schlinger, E., Garrette, D.: How multilingual is multilingual BERT? In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 4996\u20135001. ACL, Florence, Italy (2019). https:\/\/doi.org\/10.18653\/v1\/P19-1493","DOI":"10.18653\/v1\/P19-1493"},{"key":"11_CR20","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR, Online (2021). http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"11_CR21","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"11_CR22","doi-asserted-by":"publisher","unstructured":"Reimers, N., Gurevych, I.: Making monolingual sentence embeddings multilingual using knowledge distillation. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 4512\u20134525. ACL, Online (2020). 
https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.365","DOI":"10.18653\/v1\/2020.emnlp-main.365"},{"key":"11_CR23","doi-asserted-by":"publisher","unstructured":"Schneider, F., Sitaram, S.: M5 \u2013 a diverse benchmark to assess the performance of large multimodal models across multilingual and multicultural vision-language tasks. In: Findings of the Association for Computational Linguistics: EMNLP 2024, pp. 4309\u20134345. ACL, Miami, Florida, USA (2024). https:\/\/doi.org\/10.18653\/v1\/2024.findings-emnlp.250","DOI":"10.18653\/v1\/2024.findings-emnlp.250"},{"key":"11_CR24","unstructured":"Suhr, A., Zhou, S., Zhang, I., Bai, H., Artzi, Y.: A corpus for reasoning about natural language grounded in photographs. CoRR abs\/1811.00491 (2018). http:\/\/arxiv.org\/abs\/1811.00491"},{"key":"11_CR25","doi-asserted-by":"publisher","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 5100\u20135111. ACL, Hong Kong, China (2019). https:\/\/doi.org\/10.18653\/v1\/D19-1514","DOI":"10.18653\/v1\/D19-1514"},{"key":"11_CR26","unstructured":"Tiedemann, J.: Parallel data, tools and interfaces in OPUS. In: Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012), pp. 2214\u20132218. European Language Resources Association (ELRA), Istanbul, Turkey (2012). https:\/\/aclanthology.org\/L12-1246\/"},{"key":"11_CR27","doi-asserted-by":"publisher","unstructured":"Vu, T., Barua, A., Lester, B., Cer, D., Iyyer, M., Constant, N.: Overcoming catastrophic forgetting in zero-shot cross-lingual generation. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 9279\u20139300. ACL, Abu Dhabi, United Arab Emirates (2022). 
https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.630","DOI":"10.18653\/v1\/2022.emnlp-main.630"},{"key":"11_CR28","doi-asserted-by":"publisher","unstructured":"Wu, S., Dredze, M.: Do explicit alignments robustly improve multilingual encoders? In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 4471\u20134482. ACL, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.362","DOI":"10.18653\/v1\/2020.emnlp-main.362"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Xu, X., Wu, C., Rosenman, S., Lal, V., Che, W., Duan, N.: BridgeTower: building bridges between encoders in vision-language representation learning. In: Thirty-Seventh AAAI Conference on Artificial Intelligence, AAAI 2023, pp. 10637\u201310647. AAAI Press, Washington, DC, USA (2023). https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/26263","DOI":"10.1609\/aaai.v37i9.26263"},{"key":"11_CR30","unstructured":"Zeng, Y., Zhang, X., Li, H., Wang, J., Zhang, J., Zhou, W.: X$^2$-VLM: all-in-one pre-trained model for vision-language tasks (2023). http:\/\/arxiv.org\/abs\/2211.12402, [cs]"},{"key":"11_CR31","doi-asserted-by":"publisher","unstructured":"Zeng, Y., Zhou, W., Luo, A., Cheng, Z., Zhang, X.: Cross-view language modeling: towards unified cross-lingual cross-modal pre-training. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 5731\u20135746. ACL, Toronto, Canada (2023). https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.315","DOI":"10.18653\/v1\/2023.acl-long.315"},{"key":"11_CR32","doi-asserted-by":"publisher","unstructured":"Zhang, B., Williams, P., Titov, I., Sennrich, R.: Improving massively multilingual neural machine translation and zero-shot translation. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 1628\u20131639. ACL, Online (2020). 
https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.148","DOI":"10.18653\/v1\/2020.acl-main.148"},{"key":"11_CR33","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: Bertscore: evaluating text generation with BERT (2020). https:\/\/arxiv.org\/abs\/1904.09675"},{"key":"11_CR34","doi-asserted-by":"crossref","unstructured":"Zhou, M., et al.: UC2: universal cross-lingual cross-modal vision-and-language pre-training. CoRR abs\/2104.00332 (2021). https:\/\/arxiv.org\/abs\/2104.00332","DOI":"10.1109\/CVPR46437.2021.00414"}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02551-7_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T18:04:34Z","timestamp":1757441074000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02551-7_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,22]]},"ISBN":["9783032025500","9783032025517"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02551-7_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,8,22]]},"assertion":[{"value":"22 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"Erlangen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.kiv.zcu.cz\/tsd2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}