{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T17:39:12Z","timestamp":1778348352005,"version":"3.51.4"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198083","type":"print"},{"value":"9783031198090","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19809-0_30","type":"book-chapter","created":{"date-parts":[[2022,10,31]],"date-time":"2022-10-31T07:03:04Z","timestamp":1667199784000},"page":"529-544","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":208,"title":["SLIP: Self-supervision Meets Language-Image Pre-training"],"prefix":"10.1007","author":[{"given":"Norman","family":"Mu","sequence":"first","affiliation":[]},{"given":"Alexander","family":"Kirillov","sequence":"additional","affiliation":[]},{"given":"David","family":"Wagner","sequence":"additional","affiliation":[]},{"given":"Saining","family":"Xie","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,1]]},"reference":[{"key":"30_CR1","unstructured":"Bao, H., Dong, L., Wei, F.: Beit: bert pre-training of image transformers. ArXiv abs\/2106.08254 (2021)"},{"key":"30_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/978-3-030-01264-9_9","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Caron","year":"2018","unstructured":"Caron, M., Bojanowski, P., Joulin, A., Douze, M.: Deep clustering for unsupervised learning of visual features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 139\u2013156. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_9"},{"key":"30_CR3","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. ArXiv abs\/2104.14294 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"30_CR4","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P.K., Ding, N., Soricut, R.: Conceptual 12m: Pushing web-scale image-text pre-training to recognize long-tail visual concepts. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3557\u20133567 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"30_CR5","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.E.: A simple framework for contrastive learning of visual representations. ArXiv abs\/2002.05709 (2020)"},{"key":"30_CR6","unstructured":"Chen, T., Kornblith, S., Swersky, K., Norouzi, M., Hinton, G.E.: Big self-supervised models are strong semi-supervised learners. ArXiv abs\/2006.10029 (2020)"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., He, K.: An empirical study of training self-supervised vision transformers. ArXiv abs\/2104.02057 (2021)","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"30_CR8","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: Virtex: Learning visual representations from textual annotations. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11157\u201311168 (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"30_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"30_CR10","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. ArXiv abs\/2010.11929 (2021)"},{"key":"30_CR11","unstructured":"El-Nouby, A., Izacard, G., Touvron, H., Laptev, I., J\u00e9gou, H., Grave, E.: Are large-scale datasets necessary for self-supervised pre-training? ArXiv abs\/2112.10740 (2021)"},{"key":"30_CR12","unstructured":"Frome, A., et al.: Devise: a deep visual-semantic embedding model. In: NIPS (2013)"},{"key":"30_CR13","doi-asserted-by":"crossref","unstructured":"Girshick, R.B., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: 2014 IEEE Conference on Computer Vision and Pattern Recognition, pp. 580\u2013587 (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"30_CR14","unstructured":"Goyal, P., et al.: Vissl (2021). https:\/\/github.com\/facebookresearch\/vissl"},{"key":"30_CR15","doi-asserted-by":"crossref","unstructured":"Goyal, P., Mahajan, D.K., Gupta, A., Misra, I.: Scaling and benchmarking self-supervised visual representation learning. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6390\u20136399 (2019)","DOI":"10.1109\/ICCV.2019.00649"},{"key":"30_CR16","unstructured":"Grill, J.B., et al.: Bootstrap your own latent: A new approach to self-supervised learning. ArXiv abs\/2006.07733 (2020)"},{"key":"30_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u2019ar, P., Girshick, R.B.: Masked autoencoders are scalable vision learners (2021)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"30_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.B.: Momentum contrast for unsupervised visual representation learning. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9726\u20139735 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"30_CR19","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: Openclip (2021). https:\/\/doi.org\/10.5281\/zenodo.5143773","DOI":"10.5281\/zenodo.5143773"},{"key":"30_CR20","doi-asserted-by":"crossref","unstructured":"Jain, A., et al.: Mural: Multimodal, multitask retrieval across languages. ArXiv abs\/2109.05125 (2021)","DOI":"10.18653\/v1\/2021.findings-emnlp.293"},{"key":"30_CR21","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML (2021)"},{"key":"30_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1007\/978-3-319-46478-7_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Joulin","year":"2016","unstructured":"Joulin, A., van der Maaten, L., Jabri, A., Vasilache, N.: Learning visual features from large weakly supervised data. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 67\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_5"},{"key":"30_CR23","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Commun. ACM 60, 84\u201390 (2012)","journal-title":"Commun. ACM"},{"key":"30_CR24","doi-asserted-by":"crossref","unstructured":"Li, A., Jabri, A., Joulin, A., van der Maaten, L.: Learning visual n-grams from web data. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 4193\u20134202 (2017)","DOI":"10.1109\/ICCV.2017.449"},{"key":"30_CR25","unstructured":"Li, Y., et al.: Supervision exists everywhere: a data efficient contrastive language-image pre-training paradigm. ArXiv abs\/2110.05208 (2021)"},{"key":"30_CR26","unstructured":"van den Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. ArXiv abs\/1807.03748 (2018)"},{"key":"30_CR27","doi-asserted-by":"crossref","unstructured":"Quattoni, A., Collins, M., Darrell, T.: Learning visual representations using images with captions. In: 2007 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20138 (2007)","DOI":"10.1109\/CVPR.2007.383173"},{"key":"30_CR28","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"30_CR29","unstructured":"Radford, A., Narasimhan, K.: Improving language understanding by generative pre-training (2018)"},{"key":"30_CR30","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M.S., Berg, A.C., Fei-Fei, L.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115, 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"30_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1007\/978-3-030-58598-3_10","volume-title":"Computer Vision \u2013 ECCV 2020","author":"MB Sariyildiz","year":"2020","unstructured":"Sariyildiz, M.B., Perez, J., Larlus, D.: Learning visual representations with\u00a0caption annotations. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12353, pp. 153\u2013170. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58598-3_10"},{"key":"30_CR32","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"30_CR33","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., Shamma, D.A., Friedland, G., Elizalde, B., Ni, K.S., Poland, D.N., Borth, D., Li, L.J.: Yfcc100m: the new data in multimedia research. Commun. ACM 59, 64\u201373 (2016)","journal-title":"Commun. ACM"},{"key":"30_CR34","doi-asserted-by":"crossref","unstructured":"Tian, Y., Henaff, O.J., Oord, A.v.d.: Divide and contrast: Self-supervised learning from uncurated data. arXiv preprint arXiv:2105.08054 (2021)","DOI":"10.1109\/ICCV48922.2021.00991"},{"key":"30_CR35","doi-asserted-by":"crossref","unstructured":"Torralba, A., Efros, A.A.: Unbiased look at dataset bias. In: CVPR 2011, pp. 1521\u20131528 (2011)","DOI":"10.1109\/CVPR.2011.5995347"},{"key":"30_CR36","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u2019egou, H.: Training data-efficient image transformers & distillation through attention. In: ICML (2021)"},{"key":"30_CR37","doi-asserted-by":"crossref","unstructured":"Wu, Z., Xiong, Y., Yu, S.X., Lin, D.: Unsupervised feature learning via non-parametric instance-level discrimination. ArXiv abs\/1805.01978 (2018)","DOI":"10.1109\/CVPR.2018.00393"},{"key":"30_CR38","doi-asserted-by":"crossref","unstructured":"Yuan, X., et al.: Multimodal contrastive training for visual representation learning. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6991\u20137000 (2021)","DOI":"10.1109\/CVPR46437.2021.00692"},{"key":"30_CR39","unstructured":"Zhai, X., et al.: A large-scale study of representation learning with the visual task adaptation benchmark. arXiv: Computer Vision and Pattern Recognition (2019)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19809-0_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,3]],"date-time":"2022-11-03T00:16:11Z","timestamp":1667434571000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19809-0_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198083","9783031198090"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19809-0_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"1 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}