{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:47:44Z","timestamp":1778082464839,"version":"3.51.4"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585471","type":"print"},{"value":"9783030585488","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58548-8_13","type":"book-chapter","created":{"date-parts":[[2020,10,28]],"date-time":"2020-10-28T23:02:42Z","timestamp":1603926162000},"page":"214-229","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":398,"title":["Multi-modal Transformer for Video Retrieval"],"prefix":"10.1007","author":[{"given":"Valentin","family":"Gabeur","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Karteek","family":"Alahari","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cordelia","family":"Schmid","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,10,29]]},"reference":[{"key":"13_CR1","doi-asserted-by":"crossref","unstructured":"Burns, A., Tan, R., Saenko, K., Sclaroff, S., Plummer, B.A.: Language features matter: effective language representations for vision-language tasks. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00757"},{"key":"13_CR2","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? In: CVPR, A New Model and the Kinetics Dataset (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"13_CR3","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT (2019)"},{"key":"13_CR4","doi-asserted-by":"crossref","unstructured":"Gabeur, V., Sun, C., Alahari, K., Schmid, C.: CVPR 2020 video pentathlon challenge: multi-modal transformer for video retrieval. In: CVPR Video Pentathlon Workshop (2020)","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"13_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"659","DOI":"10.1007\/978-3-030-01231-1_40","volume-title":"Computer Vision \u2013 ECCV 2018","author":"D Harwath","year":"2018","unstructured":"Harwath, D., Recasens, A., Sur\u00eds, D., Chuang, G., Torralba, A., Glass, J.: Jointly discovering visual objects and spoken words from raw sensory input. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 659\u2013677. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_40"},{"key":"13_CR6","doi-asserted-by":"crossref","unstructured":"Hershey, S., et al.: CNN architectures for large-scale audio classification. In: ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8) (1997)","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"13_CR8","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Albanie, S., Sun, G., Wu, E.: Squeeze-and-excitation networks. IEEE Trans. Pattern Anal. Mach. Intell. (2019)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"13_CR9","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Weinberger, K.Q.: Densely connected convolutional networks. In: CVPR (2016)","DOI":"10.1109\/CVPR.2017.243"},{"key":"13_CR10","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.: Deep fragment embeddings for bidirectional image sentence mapping. In: NIPS (2014)"},{"key":"13_CR11","doi-asserted-by":"crossref","unstructured":"Klein, B., Lev, G., Sadeh, G., Wolf, L.: Associating neural word embeddings with deep image representations using fisher vectors. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Niebles, J.C.: Dense-captioning events in videos. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"13_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"212","DOI":"10.1007\/978-3-030-01225-0_13","volume-title":"Computer Vision \u2013 ECCV 2018","author":"K-H Lee","year":"2018","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11208, pp. 212\u2013228. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01225-0_13"},{"key":"13_CR14","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: video retrieval using representations from collaborative experts. arXiv abs\/1907.13487 (2019)"},{"key":"13_CR15","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. arXiv e-prints arXiv:1912.06430, December 2019","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"13_CR16","unstructured":"Miech, A., Laptev, I., Sivic, J.: Learning a text-video embedding from incomplete and heterogeneous data. arXiv abs\/1804.02516 (2018)"},{"key":"13_CR17","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"13_CR18","unstructured":"Mikolov, T., Chen, K., Corrado, G.S., Dean, J.: Efficient estimation of word representations in vector space. In: ICLR (2013)"},{"key":"13_CR19","doi-asserted-by":"crossref","unstructured":"Mithun, N.C., Li, J., Metze, F., Roy-Chowdhury, A.K.: Learning joint embedding with multimodal cues for cross-modal video-text retrieval. In: ICMR (2018)","DOI":"10.1145\/3206025.3206064"},{"issue":"1","key":"13_CR20","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/s13735-018-00166-3","volume":"8","author":"NC Mithun","year":"2019","unstructured":"Mithun, N.C., Li, J., Metze, F., Roy-Chowdhury, A.K.: Joint embeddings with multimodal cues for video-text retrieval. Int. J. Multimedia Inf. Retrieval 8(1), 3\u201318 (2019). https:\/\/doi.org\/10.1007\/s13735-018-00166-3","journal-title":"Int. J. Multimedia Inf. Retrieval"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"13_CR22","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NIPS (2014)"},{"key":"13_CR23","unstructured":"Sun, C., Baradel, F., Murphy, K., Schmid, C.: Learning video representations using contrastive bidirectional transformer. arXiv:1906.05743 (2019)"},{"key":"13_CR24","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"13_CR25","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NIPS (2017)"},{"key":"13_CR26","doi-asserted-by":"crossref","unstructured":"Wray, M., Larlus, D., Csurka, G., Damen, D.: Fine-grained action retrieval through multiple parts-of-speech embeddings. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00054"},{"key":"13_CR27","unstructured":"Wu, Y., et al.: Google\u2019s neural machine translation system: bridging the gap between human and machine translation. arXiv:1609.08144 (2016)"},{"key":"13_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1007\/978-3-030-01267-0_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Xie","year":"2018","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 318\u2013335. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19"},{"key":"13_CR29","unstructured":"Xing, E.P., Ng, A.Y., Jordan, M.I., Russell, S.: Distance metric learning, with application to clustering with side-information. In: NIPS (2002)"},{"key":"13_CR30","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"13_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"487","DOI":"10.1007\/978-3-030-01234-2_29","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Yu","year":"2018","unstructured":"Yu, Y., Kim, J., Kim, G.: A joint sequence fusion model for video question answering and retrieval. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11211, pp. 487\u2013503. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01234-2_29"},{"key":"13_CR32","doi-asserted-by":"crossref","unstructured":"Yu, Y., Ko, H., Choi, J., Kim, G.: End-to-end concept word detection for video captioning, retrieval, and question answering. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.347"},{"key":"13_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"385","DOI":"10.1007\/978-3-030-01261-8_23","volume-title":"Computer Vision \u2013 ECCV 2018","author":"B Zhang","year":"2018","unstructured":"Zhang, B., Hu, H., Sha, F.: Cross-modal and hierarchical modeling of video and text. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11217, pp. 385\u2013401. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01261-8_23"},{"key":"13_CR34","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1007\/s13042-010-0001-0","volume":"1","author":"Y Zhang","year":"2010","unstructured":"Zhang, Y., Jin, R., Zhou, Z.H.: Understanding bag-of-words model: a statistical framework. Int. J. Mach. Learn. Cybernet. 1, 43\u201352 (2010)","journal-title":"Int. J. Mach. Learn. Cybernet."},{"key":"13_CR35","doi-asserted-by":"crossref","unstructured":"Zhou, B., Lapedriza, \u00c0., Khosla, A., Oliva, A., Torralba, A.: Places: a 10 million image database for scene recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40, 1452\u20131464 (2018)","DOI":"10.1109\/TPAMI.2017.2723009"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58548-8_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:05:53Z","timestamp":1730160353000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58548-8_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585471","9783030585488"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58548-8_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"29 October 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}