{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T11:44:58Z","timestamp":1762429498659,"version":"3.40.3"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031198328"},{"type":"electronic","value":"9783031198335"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19833-5_40","type":"book-chapter","created":{"date-parts":[[2022,11,4]],"date-time":"2022-11-04T00:40:30Z","timestamp":1667522430000},"page":"691-708","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":25,"title":["MILES: Visual BERT Pre-training with\u00a0Injected Language Semantics for\u00a0Video-Text Retrieval"],"prefix":"10.1007","author":[{"given":"Yuying","family":"Ge","sequence":"first","affiliation":[]},{"given":"Yixiao","family":"Ge","sequence":"additional","affiliation":[]},{"given":"Xihui","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Jinpeng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jianping","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Ying","family":"Shan","sequence":"additional","affiliation":[]},{"given":"Xiaohu","family":"Qie","sequence":"additional","affiliation":[]},{"given":"Ping","family":"Luo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,4]]},"reference":[{"key":"40_CR1","unstructured":"Akbari, H., et al.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. arXiv preprint arXiv:2104.11178 (2021)"},{"issue":"6","key":"40_CR2","first-page":"7","volume":"2","author":"J-B Alayrac","year":"2020","unstructured":"Alayrac, J.-B., et al.: Self-supervised multimodal versatile networks. NeurIPS 2(6), 7 (2020)","journal-title":"NeurIPS"},{"key":"40_CR3","unstructured":"Alwassel, H., Mahajan, D., Korbar, B., Torresani, L., Ghanem, B., Tran, D.: Self-supervised learning by cross-modal audio-video clustering. In: NeurIPS 2020 (2020)"},{"key":"40_CR4","doi-asserted-by":"crossref","unstructured":"Amrani, E., Ben-Ari, R., Rotman, D., Bronstein, A.: Noise estimation using density estimation for self-supervised multimodal learning. In: AAAI, vol. 35, pp. 6644\u20136652 (2021)","DOI":"10.1609\/aaai.v35i8.16822"},{"key":"40_CR5","doi-asserted-by":"crossref","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: ICCV, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"40_CR6","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"40_CR7","unstructured":"Bao, H., Dong, L., Wei, F.: BEiT: BERT pre-training of image transformers. 
In: ICLR (2022)"},{"key":"40_CR8","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding. arXiv preprint arXiv:2102.05095 (2021)"},{"key":"40_CR9","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: ICCV, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"40_CR10","doi-asserted-by":"crossref","unstructured":"Chen, B., et al.: Multimodal clustering networks for self-supervised learning from unlabeled videos. arXiv preprint arXiv:2104.12671 (2021)","DOI":"10.1109\/ICCV48922.2021.00791"},{"key":"40_CR11","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: ACL, pp. 190\u2013200 (2011)"},{"key":"40_CR12","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"40_CR13","unstructured":"Dong, X., et al.: PeCo: perceptual codebook for BERT pre-training of vision transformers. arXiv preprint arXiv:2111.12710 (2021)"},{"key":"40_CR14","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2020)"},{"key":"40_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"214","DOI":"10.1007\/978-3-030-58548-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"V Gabeur","year":"2020","unstructured":"Gabeur, V., Sun, C., Alahari, K., Schmid, C.: Multi-modal transformer for video retrieval. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 214\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_13"},{"key":"40_CR16","doi-asserted-by":"crossref","unstructured":"Ge, Y., et al.: Bridging video-text retrieval with multiple choice questions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16167\u201316176 (2022)","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"40_CR17","doi-asserted-by":"crossref","unstructured":"Ghadiyaram, D., Tran, D., Mahajan, D.: Large-scale weakly-supervised pre-training for video action recognition. In: CVPR, pp. 12046\u201312055 (2019)","DOI":"10.1109\/CVPR.2019.01232"},{"key":"40_CR18","unstructured":"Ging, S., Zolfaghari, M., Pirsiavash, H., Brox, T.: COOT: cooperative hierarchical transformer for video-text representation learning. In: NeurIPS (2020)"},{"key":"40_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"312","DOI":"10.1007\/978-3-030-58580-8_19","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Han","year":"2020","unstructured":"Han, T., Xie, W., Zisserman, A.: Memory-augmented dense predictive coding for video representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 312\u2013329. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_19"},{"key":"40_CR20","unstructured":"Han, T., Xie, W., Zisserman, A.: Self-supervised co-training for video representation learning. In: NeurIPS, vol. 33, pp. 5679\u20135690 (2020)"},{"key":"40_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. 
arXiv preprint arXiv:2111.06377 (2021)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"40_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"40_CR23","unstructured":"Huo, Y., et al.: Compressed video contrastive learning. In: NeurIPS, vol. 34 (2021)"},{"key":"40_CR24","unstructured":"Jozefowicz, R., Vinyals, O., Schuster, M., Shazeer, N., Wu, Y.: Exploring the limits of language modeling. arXiv preprint arXiv:1602.02410 (2016)"},{"key":"40_CR25","unstructured":"Kong, Q., Wei, W., Yoshinaga, T., Deng, Z., Murakami, T.: Cycle-contrast for self-supervised video representation learning (2020)"},{"key":"40_CR26","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: ICCV, pp. 2556\u20132563. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"40_CR27","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: ClipBERT for video-and-language learning via sparse sampling. In: CVPR, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"40_CR28","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.-C., Cheng, Y., Gan, Z., Yu, L., Liu, J.: HERO: hierarchical encoder for video+ language omni-representation pre-training. In: EMNLP, pp. 2046\u20132065 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"40_CR29","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)"},{"key":"40_CR30","unstructured":"Luo, H., et al.: UniVL: a unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)"},{"key":"40_CR31","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.-B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: CVPR, pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"40_CR32","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.-B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100m: learning a text-video embedding by watching hundred million narrated video clips. In: ICCV, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"40_CR33","unstructured":"van den Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"40_CR34","unstructured":"Pan, J., Lin, Z., Zhu, X., Shao, J., Li, H.: Parameter-efficient image-to-video transfer learning. arXiv preprint arXiv:2206.13559 (2022)"},{"key":"40_CR35","unstructured":"Patrick, M., et al.: Support-set bottlenecks for video-text representation learning. In: ICLR (2020)"},{"key":"40_CR36","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A.J., Angelova, A., Ryoo, M.S.: Evolving losses for unsupervised video representation learning. In: CVPR, pp. 133\u2013142 (2020)","DOI":"10.1109\/CVPR42600.2020.00021"},{"key":"40_CR37","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. 
arXiv preprint arXiv:2103.00020 (2021)"},{"key":"40_CR38","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: ICML (2021)"},{"key":"40_CR39","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NeurIPS, vol. 28 (2015)"},{"key":"40_CR40","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: CVPR, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"40_CR41","doi-asserted-by":"crossref","unstructured":"Rouditchenko, A., et al. AVLNet: learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199 (2020)","DOI":"10.21437\/Interspeech.2021-1312"},{"key":"40_CR42","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"40_CR43","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL, pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"40_CR44","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"40_CR45","unstructured":"Sun, C., Baradel, F., Murphy, K., Schmid, C.: Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743 (2019)"},{"key":"40_CR46","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: ICCV, pp. 7464\u20137473 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"40_CR47","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. arXiv preprint arXiv:2203.12602 (2022)"},{"key":"40_CR48","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Object-aware video-language pre-training for retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3313\u20133322 (2022)","DOI":"10.1109\/CVPR52688.2022.00331"},{"key":"40_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1007\/978-3-030-01267-0_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Xie","year":"2018","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 318\u2013335. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19"},{"key":"40_CR50","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: VLM: task-agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.370"},{"key":"40_CR51","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: VideoClip: contrastive pre-training for zero-shot video-text understanding. 
arXiv preprint arXiv:2109.14084 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"40_CR52","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"40_CR53","doi-asserted-by":"crossref","unstructured":"Yang, J., Bisk, Y., Gao, J.: TACO: token-aware cascade contrastive learning for video-text alignment. In: ICCV, pp. 11562\u201311572 (2021)","DOI":"10.1109\/ICCV48922.2021.01136"},{"key":"40_CR54","unstructured":"Zhou, J., et al.: iBOT: image BERT pre-training with online tokenizer. In: ICLR (2022)"},{"key":"40_CR55","doi-asserted-by":"crossref","unstructured":"Zhu, L., Yang, Y.: ActBERT: learning global-local video-text representations. In: CVPR, pp. 8746\u20138755 (2020)","DOI":"10.1109\/CVPR42600.2020.00877"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19833-5_40","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T15:48:35Z","timestamp":1673279315000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19833-5_40"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198328","9783031198335"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19833-5_40","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"4 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"From the workshops, 367 reviewed full papers have been selected for publication","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}