{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T10:25:34Z","timestamp":1743071134367,"version":"3.40.3"},"publisher-location":"Cham","reference-count":47,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031282430"},{"type":"electronic","value":"9783031282447"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-28244-7_42","type":"book-chapter","created":{"date-parts":[[2023,3,16]],"date-time":"2023-03-16T17:03:18Z","timestamp":1678986198000},"page":"669-684","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Improving Video Retrieval Using Multilingual Knowledge Transfer"],"prefix":"10.1007","author":[{"given":"Avinash","family":"Madasu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Estelle","family":"Aflalo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gabriela","family":"Ben Melech Stan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shao-Yen","family":"Tseng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gedas","family":"Bertasius","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vasudev","family":"Lal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,3,17]]},"reference":[{"key":"42_CR1","doi-asserted-by":"crossref","unstructured":"Amrani, E., Ben-Ari, R., Rotman, D., Bronstein, A.: Noise estimation using density estimation for self-supervised multimodal learning. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol. 35, pp. 6644\u20136652 (2021)","DOI":"10.1609\/aaai.v35i8.16822"},{"key":"42_CR2","doi-asserted-by":"crossref","unstructured":"Anne Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"42_CR3","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: A joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"42_CR4","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML. vol. 2, p. 4 (2021)"},{"key":"42_CR5","doi-asserted-by":"crossref","unstructured":"Burns, A., Kim, D., Wijaya, D., Saenko, K., Plummer, B.A.: Learning to scale multilingual representations for vision-language tasks. In: European Conference on Computer Vision, pp. 197\u2013213. Springer (2020)","DOI":"10.1007\/978-3-030-58548-8_12"},{"key":"42_CR6","doi-asserted-by":"crossref","unstructured":"Cao, S., Wang, B., Zhang, W., Ma, L.: Visual consensus modeling for video-text retrieval (2022)","DOI":"10.1609\/aaai.v36i1.19891"},{"key":"42_CR7","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},{"key":"42_CR8","unstructured":"Cheng, X., Lin, H., Wu, X., Yang, F., Shen, D.: Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)"},{"key":"42_CR9","unstructured":"Conneau, A., Lample, G.: Cross-lingual language model pretraining. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"42_CR10","doi-asserted-by":"crossref","unstructured":"Conneau, A., et al.: Xnli: Evaluating cross-lingual sentence representations. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 2475\u20132485 (2018)","DOI":"10.18653\/v1\/D18-1269"},{"key":"42_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"42_CR12","doi-asserted-by":"crossref","unstructured":"Dzabraev, M., Kalashnikov, M., Komkov, S., Petiushko, A.: Mdmmt: Multidomain multimodal transformer for video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3354\u20133363 (2021)","DOI":"10.1109\/CVPRW53098.2021.00374"},{"key":"42_CR13","unstructured":"Fang, H., Xiong, P., Xu, L., Chen, Y.: Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)"},{"key":"42_CR14","unstructured":"Gao, Z., Liu, J., Chen, S., Chang, D., Zhang, H., Yuan, J.: Clip2tv: An empirical study on transformer-based methods for video-text retrieval. arXiv preprint arXiv:2111.05610 (2021)"},{"key":"42_CR15","doi-asserted-by":"crossref","unstructured":"Ge, Y., et al.: Bridging video-text retrieval with multiple choice questions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16167\u201316176 (2022)","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"42_CR16","doi-asserted-by":"crossref","unstructured":"Ge, Y., et al.: Miles: Visual bert pre-training with injected language semantics for video-text retrieval. arXiv preprint arXiv:2204.12408 (2022)","DOI":"10.1007\/978-3-031-19833-5_40"},{"key":"42_CR17","doi-asserted-by":"crossref","unstructured":"Gella, S., Sennrich, R., Keller, F., Lapata, M.: Image pivoting for learning multilingual multimodal representations. In: Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, pp. 2839\u20132845 (2017)","DOI":"10.18653\/v1\/D17-1303"},{"key":"42_CR18","doi-asserted-by":"crossref","unstructured":"Huang, P.Y., Patrick, M., Hu, J., Neubig, G., Metze, F., Hauptmann, A.G.: Multilingual multimodal pre-training for zero-shot cross-lingual transfer of vision-language models. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 2443\u20132459 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.195"},{"key":"42_CR19","doi-asserted-by":"crossref","unstructured":"Kim, D., Saito, K., Saenko, K., Sclaroff, S., Plummer, B.: Mule: Multimodal universal language embedding. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol. 34, pp. 11254\u201311261 (2020)","DOI":"10.1609\/aaai.v34i07.6785"},{"key":"42_CR20","doi-asserted-by":"crossref","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Revealing single frame bias for video-and-language learning. arXiv preprint arXiv:2206.03428 (2022)","DOI":"10.18653\/v1\/2023.acl-long.29"},{"key":"42_CR21","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: Clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"42_CR22","doi-asserted-by":"crossref","unstructured":"Li, D., Li, J., Li, H., Niebles, J.C., Hoi, S.C.: Align and prompt: Video-and-language pre-training with entity prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4953\u20134963 (2022)","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"42_CR23","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., Liu, J.: Hero: Hierarchical encoder for video+ language omni-representation pre-training. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 2046\u20132065 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"42_CR24","doi-asserted-by":"crossref","unstructured":"Lin, Y.B., Lei, J., Bansal, M., Bertasius, G.: Eclipse: Efficient long-range video retrieval using sight and sound. arXiv preprint arXiv:2204.02874 (2022)","DOI":"10.1007\/978-3-031-19830-4_24"},{"key":"42_CR25","doi-asserted-by":"crossref","unstructured":"Liu, S., Fan, H., Qian, S., Chen, Y., Ding, W., Wang, Z.: Hit: Hierarchical transformer with momentum contrast for video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11915\u201311925 (2021)","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"42_CR26","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)"},{"key":"42_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Y., Xiong, P., Xu, L., Cao, S., Jin, Q.: Ts2-net: Token shift and selection transformer for text-video retrieval. arXiv preprint arXiv:2207.07852 (2022)","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"42_CR28","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"42_CR29","doi-asserted-by":"crossref","unstructured":"Luo, H., et al.: Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860 (2021)","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"42_CR30","doi-asserted-by":"crossref","unstructured":"Madasu, A., Oliva, J., Bertasius, G.: Learning to retrieve videos by asking questions. arXiv preprint arXiv:2205.05739 (2022)","DOI":"10.1145\/3503161.3548361"},{"key":"42_CR31","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"42_CR32","unstructured":"Patrick, M., Huang, P.Y., Asano, Y., Metze, F., Hauptmann, A.G., Henriques, J.F., Vedaldi, A.: Support-set bottlenecks for video-text representation learning. In: International Conference on Learning Representations (2020)"},{"key":"42_CR33","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"42_CR34","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., et al.: Visual grounding in video for unsupervised word translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10850\u201310859 (2020)","DOI":"10.1109\/CVPR42600.2020.01086"},{"key":"42_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"42_CR36","doi-asserted-by":"crossref","unstructured":"Sur\u00eds, D., Epstein, D., Vondrick, C.: Globetrotter: Connecting languages by connecting images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16474\u201316484 (2022)","DOI":"10.1109\/CVPR52688.2022.01598"},{"key":"42_CR37","unstructured":"Tang, Y., et al.: Multilingual translation with extensible multilingual pretraining and finetuning. arXiv preprint arXiv:2008.00401 (2020)"},{"key":"42_CR38","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence-video to text. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"42_CR39","unstructured":"Wang, A.J., et al.: All in one: Exploring unified video-language pre-training. arXiv preprint arXiv:2203.07303 (2022)"},{"key":"42_CR40","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Object-aware video-language pre-training for retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3313\u20133322 (2022)","DOI":"10.1109\/CVPR52688.2022.00331"},{"key":"42_CR41","doi-asserted-by":"crossref","unstructured":"Wehrmann, J., Souza, D.M., Lopes, M.A., Barros, R.C.: Language-agnostic visual-semantic embeddings. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5804\u20135813 (2019)","DOI":"10.1109\/ICCV.2019.00590"},{"key":"42_CR42","doi-asserted-by":"publisher","unstructured":"Wu, H.Y., Zhai, A.: Classification is a strong baseline for deep metric learning. In: Sidorov, K., Hicks, Y. (eds.) Proceedings of the British Machine Vision Conference (BMVC), pp. 224.1-224.12. BMVA Press (September 2019). https:\/\/doi.org\/10.5244\/C.33.224","DOI":"10.5244\/C.33.224"},{"key":"42_CR43","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: Contrastive pre-training for zero-shot video-text understanding. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 6787\u20136800 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"42_CR44","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: Msr-vtt: A large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"42_CR45","doi-asserted-by":"crossref","unstructured":"Yu, Y., Kim, J., Kim, G.: A joint sequence fusion model for video question answering and retrieval. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 471\u2013487 (2018)","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"42_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, B., Hu, H., Sha, F.: Cross-modal and hierarchical modeling of video and text. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 374\u2013390 (2018)","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"42_CR47","doi-asserted-by":"crossref","unstructured":"Zhu, L., Yang, Y.: Actbert: Learning global-local video-text representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8746\u20138755 (2020)","DOI":"10.1109\/CVPR42600.2020.00877"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-28244-7_42","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,5]],"date-time":"2024-03-05T13:44:13Z","timestamp":1709646253000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-28244-7_42"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031282430","9783031282447"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-28244-7_42","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"17 March 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Dublin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Ireland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 April 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 April 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"45","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2023.org\/index.html?v=1.0","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"489","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"77","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"83","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"16% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}