{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T15:27:24Z","timestamp":1772119644747,"version":"3.50.1"},"reference-count":67,"publisher":"Springer Science and Business Media LLC","issue":"1-2","license":[{"start":{"date-parts":[[2023,9,25]],"date-time":"2023-09-25T00:00:00Z","timestamp":1695600000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,9,25]],"date-time":"2023-09-25T00:00:00Z","timestamp":1695600000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Inf Retrieval J"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s10791-023-09422-5","type":"journal-article","created":{"date-parts":[[2023,9,25]],"date-time":"2023-09-25T15:01:36Z","timestamp":1695654096000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["MuMUR: Multilingual Multimodal Universal Retrieval"],"prefix":"10.1007","volume":"26","author":[{"given":"Avinash","family":"Madasu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Estelle","family":"Aflalo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gabriela Ben Melech","family":"Stan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shachar","family":"Rosenman","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shao-Yen","family":"Tseng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gedas","family":"Bertasius","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vasudev","family":"Lal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,9,25]]},"reference":[{"key":"9422_CR1","doi-asserted-by":"publisher","first-page":"6644","DOI":"10.1609\/aaai.v35i8.16822","volume":"35","author":"E Amrani","year":"2021","unstructured":"Amrani, E., Ben-Ari, R., Rotman, D., & Bronstein, A. (2021). Noise estimation using density estimation for self-supervised multimodal learning. Proceedings of the AAAI Conference on Artificial Intelligence, 35, 6644\u20136652.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"9422_CR2","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., & Russell, B. (2017). Localizing moments in video with natural language. In: Proceedings of the IEEE international conference on computer vision. 5803\u20135812.","DOI":"10.1109\/ICCV.2017.618"},{"key":"9422_CR3","doi-asserted-by":"crossref","unstructured":"Artetxe, M., Ruder, S., & Yogatama, D. (2019).  On the cross-lingual transferability of monolingual representations","DOI":"10.18653\/v1\/2020.acl-main.421"},{"key":"9422_CR4","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., & Zisserman, A. (2021). Frozen in time: A joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 1728\u20131738.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"9422_CR5","unstructured":"Bertasius, G., Wang, H., & Torresani, L. (2021). Is space-time attention all you need for video understanding? In: ICML. 2, 4."},{"key":"9422_CR6","doi-asserted-by":"crossref","unstructured":"Burns, A., Kim, D., Wijaya, D., Saenko, K., & Plummer, B.A. (2020). Learning to scale multilingual representations for vision-language tasks. In: European Conference on Computer Vision. 197\u2013213. Springer.","DOI":"10.1007\/978-3-030-58548-8_12"},{"key":"9422_CR7","doi-asserted-by":"crossref","unstructured":"Cao, S., Wang, B., Zhang, W., & Ma, L. (2022). Visual consensus modeling for video-text retrieval.","DOI":"10.1609\/aaai.v36i1.19891"},{"key":"9422_CR8","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., & Soricut, R. (2021). Conceptual 12m: Pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3558\u20133568.","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"9422_CR9","unstructured":"Chen, D., & Dolan, W.B. (2011) Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th annual meeting of the association for computational linguistics: human language technologies. 190\u2013200."},{"key":"9422_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J. (2019). Uniter: Learning universal image-text representations.","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"9422_CR11","unstructured":"Cheng, X., Lin, H., Wu, X., Yang, F., & Shen, D. (2021). Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290."},{"key":"9422_CR12","unstructured":"Conneau, A., & Lample, G. (2019). Cross-lingual language model pretraining. Advances in neural information processing systems 32."},{"key":"9422_CR13","doi-asserted-by":"crossref","unstructured":"Conneau, A., Rinott, R., Lample, G., Williams, A., Bowman, S., Schwenk, H., & Stoyanov, V. (2018). Xnli: Evaluating cross-lingual sentence representations. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing. 2475\u20132485.","DOI":"10.18653\/v1\/D18-1269"},{"key":"9422_CR14","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et\u00a0al. & (2020). An image is worth 16x16 words: Transformers for image recognition at scale. In: International Conference on Learning Representations."},{"key":"9422_CR15","doi-asserted-by":"crossref","unstructured":"Dzabraev, M., Kalashnikov, M., Komkov, S., & Petiushko, A. (2021). Mdmmt: Multidomain multimodal transformer for video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3354\u20133363.","DOI":"10.1109\/CVPRW53098.2021.00374"},{"key":"9422_CR16","doi-asserted-by":"crossref","unstructured":"Elliott, D., Frank, S., Sima\u2019an, K., & Specia, L. (2016). Multi30k: Multilingual english-german image descriptions. In: Proceedings of the 5th Workshop on Vision and Language. 70\u201374.","DOI":"10.18653\/v1\/W16-3210"},{"key":"9422_CR17","unstructured":"Fang, H., Xiong, P., Xu, L., & Chen, Y. (2021). Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097."},{"key":"9422_CR18","unstructured":"Gao, Z., Liu, J., Chen, S., Chang, D., Zhang, H., & Yuan, J. (2021) Clip2tv: An empirical study on transformer-based methods for video-text retrieval. arXiv preprint arXiv:2111.05610."},{"key":"9422_CR19","doi-asserted-by":"crossref","unstructured":"Ge, Y., Ge, Y., Liu, X., Li, D., Shan, Y., Qie, X., & Luo, P. (2022). Bridging video-text retrieval with multiple choice questions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16167\u201316176.","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"9422_CR20","doi-asserted-by":"crossref","unstructured":"Ge, Y., Ge, Y., Liu, X., Wang, A.J., Wu, J., Shan, Y., Qie, X., & Luo, P. (2022). Miles: Visual bert pre-training with injected language semantics for video-text retrieval. arXiv preprint arXiv:2204.12408.","DOI":"10.1007\/978-3-031-19833-5_40"},{"key":"9422_CR21","doi-asserted-by":"crossref","unstructured":"Gella, S., Sennrich, R., Keller, F., & Lapata, M. (2017). Image pivoting for learning multilingual multimodal representations. In: Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing. 2839\u20132845.","DOI":"10.18653\/v1\/D17-1303"},{"key":"9422_CR22","unstructured":"Hu, J., Ruder, S., Siddhant, A., Neubig, G., Firat, O., Johnson, M. (2020). Xtreme: A massively multilingual multi-task benchmark for evaluating cross-lingual generalisation. In: International Conference on Machine Learning. 4411\u20134421. PMLR."},{"key":"9422_CR23","doi-asserted-by":"crossref","unstructured":"Huang, P.Y., Patrick, M., Hu, J., Neubig, G., Metze, F., Hauptmann, A.G. (2021). Multilingual multimodal pre-training for zero-shot cross-lingual transfer of vision-language models. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. 2443\u20132459.","DOI":"10.18653\/v1\/2021.naacl-main.195"},{"key":"9422_CR24","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q., Sung, Y.H., Li, Z., Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. In: International conference on machine learning. 4904\u20134916. PMLR."},{"key":"9422_CR25","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 3128\u20133137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"9422_CR26","doi-asserted-by":"crossref","unstructured":"Kim, D., Saito, K., Saenko, K., Sclaroff, S., & Plummer, B. (2020). Mule: Multimodal universal language embedding. In: Proceedings of the AAAI Conference on Artificial Intelligence. 34, 11254\u201311261.","DOI":"10.1609\/aaai.v34i07.6785"},{"key":"9422_CR27","unstructured":"Kim, W., Son, B., & Kim, I. (2021). Vilt: Vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning. pp. 5583\u20135594. PMLR"},{"key":"9422_CR28","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L. J., Shamma, D. A., et al. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision, 123, 32\u201373.","journal-title":"International Journal of Computer Vision"},{"key":"9422_CR29","doi-asserted-by":"crossref","unstructured":"Lei, J., Berg, T.L., & Bansal, M. (2022). Revealing single frame bias for video-and-language learning. arXiv preprint arXiv:2206.03428","DOI":"10.18653\/v1\/2023.acl-long.29"},{"key":"9422_CR30","doi-asserted-by":"crossref","unstructured":"Lei, J., Li, L., Zhou, L., Gan, Z., Berg, T.L., Bansal, M., Liu, J. (2021). Less is more: Clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 7331\u20137341.","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"9422_CR31","doi-asserted-by":"crossref","unstructured":"Li, D., Li, J., Li, H., Niebles, J.C., Hoi, S.C. (2022) Align and prompt: Video-and-language pre-training with entity prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 4953\u20134963.","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"9422_CR32","unstructured":"Li, J., Li, D., Xiong, C., & Hoi, S. (2022). Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning. 12888\u201312900. PMLR"},{"key":"9422_CR33","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., & Hoi, S. C. H. (2021). Align before fuse: Vision and language representation learning with momentum distillation. Advances in Neural Information Processing Systems, 34, 9694\u20139705.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"9422_CR34","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., Liu, J. (2020). Hero: Hierarchical encoder for video+ language omni-representation pre-training. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). 2046\u20132065.","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"9422_CR35","doi-asserted-by":"crossref","unstructured":"Lin, Y.B., Lei, J., Bansal, M., & Bertasius, G. (2022). Eclipse: Efficient long-range video retrieval using sight and sound. arXiv preprint arXiv:2204.02874.","DOI":"10.1007\/978-3-031-19830-4_24"},{"key":"9422_CR36","doi-asserted-by":"crossref","unstructured":"Liu, S., Fan, H., Qian, S., Chen, Y., Ding, W., Wang, Z. (2021). Hit: Hierarchical transformer with momentum contrast for video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 11915\u201311925.","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"9422_CR37","unstructured":"Liu, Y., Albanie, S., Nagrani, A., & Zisserman, A. (2019) Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487."},{"key":"9422_CR38","doi-asserted-by":"crossref","unstructured":"Liu, Y., Xiong, P., Xu, L., Cao, S., Jin, Q. (2022). Ts2-net: Token shift and selection transformer for text-video retrieval. arXiv preprint arXiv:2207.07852.","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"9422_CR39","unstructured":"Loshchilov, I., & Hutter, F. (2018). Decoupled weight decay regularization. In: International Conference on Learning Representations."},{"key":"9422_CR40","doi-asserted-by":"crossref","unstructured":"Lu, J., Goswami, V., Rohrbach, M., Parikh, D., & Lee, S. (2020). 12-in-1: Multi-task vision and language representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 10437\u201310446.","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"9422_CR41","doi-asserted-by":"crossref","unstructured":"Luo, H., Ji, L., Zhong, M., Chen, Y., Lei, W., Duan, N., & Li, T. (2021). Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"9422_CR42","doi-asserted-by":"crossref","unstructured":"Madasu, A., Aflalo, E., Ben Melech\u00a0Stan, G., Tseng, S.Y., Bertasius, G., & Lal, V. (2023). Improving video retrieval using multilingual knowledge transfer. In: European Conference on Information Retrieval. 669\u2013684. Springer.","DOI":"10.1007\/978-3-031-28244-7_42"},{"key":"9422_CR43","doi-asserted-by":"crossref","unstructured":"Madasu, A., Oliva, J., & Bertasius, G. (2022). Learning to retrieve videos by asking questions. arXiv preprint arXiv:2205.05739","DOI":"10.1145\/3503161.3548361"},{"key":"9422_CR44","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., & Sivic, J. (2019). Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2630\u20132640.","DOI":"10.1109\/ICCV.2019.00272"},{"key":"9422_CR45","doi-asserted-by":"crossref","unstructured":"Ni, M., Huang, H., Su, L., Cui, E., Bharti, T., Wang, L., Zhang, D., & Duan, N. (2021). M3p: Learning universal representations via multitask multilingual multimodal pre-training. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 3977\u20133986.","DOI":"10.1109\/CVPR46437.2021.00397"},{"key":"9422_CR46","unstructured":"Patrick, M., Huang, P.Y., Asano, Y., Metze, F., Hauptmann, A.G., Henriques, J.F., Vedaldi, A. (2020). Support-set bottlenecks for video-text representation learning. In: International Conference on Learning Representations."},{"key":"9422_CR47","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision. 2641\u20132649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"9422_CR48","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., & et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning. 8748\u20138763. PMLR."},{"key":"9422_CR49","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Alayrac, J.B., Nematzadeh, A., Smaira, L., Malinowski, M., Carreira, J., Blunsom, P., & Zisserman, A. (2020). Visual grounding in video for unsupervised word translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 10850\u201310859.","DOI":"10.1109\/CVPR42600.2020.01086"},{"key":"9422_CR50","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., & Gupta, A. (2016). Hollywood in homes: Crowdsourcing data collection for activity understanding. In: European Conference on Computer Vision. 510\u2013526. Springer.","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"9422_CR51","doi-asserted-by":"crossref","unstructured":"Sur\u00eds, D., Epstein, D., Vondrick, C. (2022). Globetrotter: Connecting languages by connecting images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16474\u201316484.","DOI":"10.1109\/CVPR52688.2022.01598"},{"key":"9422_CR52","unstructured":"Tang, Y., Tran, C., Li, X., Chen, P.J., Goyal, N., Chaudhary, V., Gu, J., & Fan, A. (2020). Multilingual translation with extensible multilingual pretraining and finetuning. arXiv preprint arXiv:2008.00401"},{"key":"9422_CR53","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., & Saenko, K. (2015). Sequence to sequence-video to text. In: Proceedings of the IEEE international conference on computer vision. 4534\u20134542.","DOI":"10.1109\/ICCV.2015.515"},{"key":"9422_CR54","doi-asserted-by":"crossref","unstructured":"Wang, A.J., Ge, Y., Yan, R., Ge, Y., Lin, X., Cai, G., Wu, J., Shan, Y., Qie, X., & Shou, M.Z. (2022). All in one: Exploring unified video-language pre-training. arXiv preprint arXiv:2203.07303","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"9422_CR55","doi-asserted-by":"crossref","unstructured":"Wang, J., Ge, Y., Cai, G., Yan, R., Lin, X., Shan, Y., Qie, X., & Shou, M.Z. (2022). Object-aware video-language pre-training for retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3313\u20133322.","DOI":"10.1109\/CVPR52688.2022.00331"},{"issue":"2","key":"9422_CR56","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1109\/TPAMI.2018.2797921","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., Li, Y., Huang, J., & Lazebnik, S. (2018). Learning two-branch neural networks for image-text matching tasks. IEEE Transactions on Pattern Analysis and Machine Intelligence, 41(2), 394\u2013407.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"9422_CR57","doi-asserted-by":"crossref","unstructured":"Wehrmann, J., Souza, D.M., Lopes, M.A., & Barros, R.C. (2019). Language-agnostic visual-semantic embeddings. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 5804\u20135813.","DOI":"10.1109\/ICCV.2019.00590"},{"key":"9422_CR58","doi-asserted-by":"publisher","unstructured":"Wu, H.Y., & Zhai, A. (2019). Classification is a strong baseline for deep metric learning. In: Sidorov, K., Hicks, Y. (eds.) Proceedings of the British Machine Vision Conference (BMVC). 224.1\u2013224.12. BMVA Press. https:\/\/doi.org\/10.5244\/C.33.224,","DOI":"10.5244\/C.33.224"},{"key":"9422_CR59","doi-asserted-by":"crossref","unstructured":"WU SJ, D.M. (2019). The surprising cross-lingual effectiveness of bert. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, Hong Kong, China. 833\u2013844.","DOI":"10.18653\/v1\/D19-1077"},{"key":"9422_CR60","doi-asserted-by":"crossref","unstructured":"Xu, H., Ghosh, G., Huang, P.Y., Okhonko, D., Aghajanyan, A., Metze, F., Zettlemoyer, L., & Feichtenhofer, C. (2021). Videoclip: Contrastive pre-training for zero-shot video-text understanding. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. 6787\u20136800.","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"9422_CR61","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., & Rui, Y. (2016). Msr-vtt: A large video description dataset for bridging video and language. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 5288\u20135296.","DOI":"10.1109\/CVPR.2016.571"},{"key":"9422_CR62","doi-asserted-by":"publisher","unstructured":"Xu, X., Li, B., Wu, C., Tseng, S.Y., Bhiwandiwalla, A., Rosenman, S., Lal, V., Che, W., & Duan, N. (2023). ManagerTower: Aggregating the insights of uni-modal experts for vision-language representation learning. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 14507\u201314525. Association for Computational Linguistics, Toronto, Canada. https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.811, https:\/\/aclanthology.org\/2023.acl-long.811","DOI":"10.18653\/v1\/2023.acl-long.811"},{"key":"9422_CR63","doi-asserted-by":"publisher","first-page":"10637","DOI":"10.1609\/aaai.v37i9.26263","volume":"37","author":"X Xu","year":"2023","unstructured":"Xu, X., Wu, C., Rosenman, S., Lal, V., Che, W., & Duan, N. (2023). Bridgetower: Building bridges between encoders in vision-language representation learning. Proceedings of the AAAI Conference on Artificial Intelligence, 37, 10637\u201310647.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"9422_CR64","doi-asserted-by":"crossref","unstructured":"Yu, Y., Kim, J., & Kim, G. (2018). A joint sequence fusion model for video question answering and retrieval. In: Proceedings of the European Conference on Computer Vision (ECCV). 471\u2013487.","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"9422_CR65","doi-asserted-by":"crossref","unstructured":"Zhang, B., Hu, H., & Sha, F. (2018). Cross-modal and hierarchical modeling of video and text. In: Proceedings of the european conference on computer vision (ECCV). 374\u2013390.","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"9422_CR66","doi-asserted-by":"crossref","unstructured":"Zhou, M., Zhou, L., Wang, S., Cheng, Y., Li, L., Yu, Z., & Liu, J. (2021). Uc2: Universal cross-lingual cross-modal vision-and-language pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 4155\u20134165.","DOI":"10.1109\/CVPR46437.2021.00414"},{"key":"9422_CR67","doi-asserted-by":"crossref","unstructured":"Zhu, L., & Yang, Y. (2020). Actbert: Learning global-local video-text representations. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 8746\u20138755.","DOI":"10.1109\/CVPR42600.2020.00877"}],"container-title":["Information Retrieval Journal"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10791-023-09422-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10791-023-09422-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10791-023-09422-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,2]],"date-time":"2024-01-02T09:16:27Z","timestamp":1704186987000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10791-023-09422-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,25]]},"references-count":67,"journal-issue":{"issue":"1-2","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["9422"],"URL":"https:\/\/doi.org\/10.1007\/s10791-023-09422-5","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-3283631\/v1","asserted-by":"object"}]},"ISSN":["1386-4564","1573-7659"],"issn-type":[{"value":"1386-4564","type":"print"},{"value":"1573-7659","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,9,25]]},"assertion":[{"value":"21 August 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 September 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 September 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they do not have competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This declaration is not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}],"article-number":"5"}}