{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:50:13Z","timestamp":1778082613917,"version":"3.51.4"},"publisher-location":"Cham","reference-count":93,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198113","type":"print"},{"value":"9783031198120","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19812-0_8","type":"book-chapter","created":{"date-parts":[[2022,10,29]],"date-time":"2022-10-29T14:03:42Z","timestamp":1667052222000},"page":"123-143","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":41,"title":["Object Discovery and\u00a0Representation Networks"],"prefix":"10.1007","author":[{"given":"Olivier J.","family":"H\u00e9naff","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Skanda","family":"Koppula","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Evan","family":"Shelhamer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daniel","family":"Zoran","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Andrew","family":"Jaegle","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jo\u00e3o","family":"Carreira","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Relja","family":"Arandjelovi\u0107","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,30]]},"reference":[{"key":"8_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, P., Carreira, J., Malik, J.: Learning to see by moving. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.13"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen and learn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 609\u2013617 (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"8_CR3","doi-asserted-by":"crossref","unstructured":"Arbel\u00e1ez, P., Pont-Tuset, J., Barron, J.T., Marques, F., Malik, J.: Multiscale combinatorial grouping. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 328\u2013335 (2014)","DOI":"10.1109\/CVPR.2014.49"},{"key":"8_CR4","unstructured":"Asano, Y.M., Rupprecht, C., Vedaldi, A.: Self-labelling via simultaneous clustering and representation learning. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"8_CR5","first-page":"15535","volume":"32","author":"P Bachman","year":"2019","unstructured":"Bachman, P., Hjelm, R.D., Buchwalter, W.: Learning representations by maximizing mutual information across views. Adv. Neural. Inf. Process. Syst. 32, 15535\u201315545 (2019)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR6","unstructured":"Baevski, A., Hsu, W.N., Xu, Q., Babu, A., Gu, J., Auli, M.: Data2vec: a general framework for self-supervised learning in speech, vision and language. arXiv preprint arXiv:2202.03555 (2022)"},{"key":"8_CR7","unstructured":"Bao, H., Dong, L., Wei, F.: BEiT: BERT pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)"},{"key":"8_CR8","unstructured":"Burgess, C.P., et al.: MONet: unsupervised scene decomposition and representation. CoRR abs\/1901.11390 (2019). http:\/\/arxiv.org\/abs\/1901.11390"},{"key":"8_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1007\/978-3-030-01264-9_9","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Caron","year":"2018","unstructured":"Caron, M., Bojanowski, P., Joulin, A., Douze, M.: Deep clustering for unsupervised learning of visual features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 139\u2013156. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_9"},{"key":"8_CR10","doi-asserted-by":"crossref","unstructured":"Caron, M., Bojanowski, P., Mairal, J., Joulin, A.: Unsupervised pre-training of image features on non-curated data. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2959\u20132968 (2019)","DOI":"10.1109\/ICCV.2019.00305"},{"key":"8_CR11","first-page":"9912","volume":"33","author":"M Caron","year":"2020","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. Adv. Neural. Inf. Process. Syst. 33, 9912\u20139924 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"8_CR13","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"8_CR14","unstructured":"Chen, X., Fan, H., Girshick, R., He, K.: Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)"},{"key":"8_CR15","doi-asserted-by":"crossref","unstructured":"Chen, X., He, K.: Exploring simple Siamese representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15750\u201315758 (2021)","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., He, K.: An empirical study of training self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9640\u20139649 (2021)","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"8_CR17","unstructured":"Cho, J.H., Mall, U., Bala, K., Hariharan, B.: PiCIE: unsupervised semantic segmentation using invariance and equivariance in clustering. In: CVPR, pp. 16794\u201316804 (2021)"},{"key":"8_CR18","doi-asserted-by":"crossref","unstructured":"Cho, M., Kwak, S., Schmid, C., Ponce, J.: Unsupervised object discovery and localization in the wild: part-based matching with bottom-up region proposals. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1201\u20131210 (2015)","DOI":"10.1109\/CVPR.2015.7298724"},{"key":"8_CR19","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3213\u20133223 (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"8_CR20","unstructured":"Didolkar, A., et al.: Neural production systems (2021)"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1422\u20131430 (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Doersch, C., Zisserman, A.: Multi-task self-supervised visual learning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2051\u20132060 (2017)","DOI":"10.1109\/ICCV.2017.226"},{"key":"8_CR23","unstructured":"Donahue, J., Kr\u00e4henb\u00fchl, P., Darrell, T.: Adversarial feature learning. In: International Conference on Learning Representations (ICLR) (2017)"},{"key":"8_CR24","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"8_CR25","doi-asserted-by":"crossref","unstructured":"Dosovitskiy, A., Springenberg, J.T., Riedmiller, M., Brox, T.: Discriminative unsupervised feature learning with convolutional neural networks. In: NIPS (2014)","DOI":"10.1109\/CVPR.2015.7298761"},{"key":"8_CR26","doi-asserted-by":"crossref","unstructured":"Dwibedi, D., Aytar, Y., Tompson, J., Sermanet, P., Zisserman, A.: With a little help from my friends: Nearest-neighbor contrastive learning of visual representations. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9588\u20139597 (2021)","DOI":"10.1109\/ICCV48922.2021.00945"},{"issue":"1","key":"8_CR27","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/s11263-014-0733-5","volume":"111","author":"M Everingham","year":"2015","unstructured":"Everingham, M., Eslami, S.A., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes challenge: a retrospective. Int. J. Comput. Vision 111(1), 98\u2013136 (2015)","journal-title":"Int. J. Comput. Vision"},{"issue":"2","key":"8_CR28","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1023\/B:VISI.0000022288.19776.77","volume":"59","author":"PF Felzenszwalb","year":"2004","unstructured":"Felzenszwalb, P.F., Huttenlocher, D.P.: Efficient graph-based image segmentation. Int. J. Comput. Vision 59(2), 167\u2013181 (2004)","journal-title":"Int. J. Comput. Vision"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Feng, C., Zhong, Y., Gao, Y., Scott, M.R., Huang, W.: TOOD: task-aligned one-stage object detection. In: International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00349"},{"key":"8_CR30","unstructured":"Gidaris, S., Singh, P., Komodakis, N.: Unsupervised representation learning by predicting image rotations. In: International Conference on Learning Representations (ICLR) (2018)"},{"key":"8_CR31","unstructured":"GitHub: TPU object detection and segmentation framework (2021). https:\/\/github.com\/tensorflow\/tpu\/tree\/master\/models\/official\/detection"},{"key":"8_CR32","unstructured":"Goyal, A., et al.: Recurrent independent mechanisms. arXiv preprint arXiv:1909.10893 (2019)"},{"key":"8_CR33","unstructured":"Goyal, P., et al.: Self-supervised pretraining of visual features in the wild. arXiv preprint arXiv:2103.01988 (2021)"},{"key":"8_CR34","unstructured":"Greff, K., et al.: Multi-object representation learning with iterative variational inference. In: International Conference on Machine Learning, pp. 2424\u20132433. PMLR (2019)"},{"key":"8_CR35","unstructured":"Grill, J.B., et al.: Bootstrap your own latent-a new approach to self-supervised learning. In: Advances in Neural Information Processing Systems 33 (2020)"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Hadsell, R., Chopra, S., LeCun, Y.: Dimensionality reduction by learning an invariant mapping. In: 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR2006), vol. 2, pp. 1735\u20131742. IEEE (2006)","DOI":"10.1109\/CVPR.2006.100"},{"key":"8_CR37","first-page":"5679","volume":"33","author":"T Han","year":"2020","unstructured":"Han, T., Xie, W., Zisserman, A.: Self-supervised co-training for video representation learning. Adv. Neural. Inf. Process. Syst. 33, 5679\u20135690 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR38","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. arXiv preprint arXiv:2111.06377 (2021)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"8_CR39","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: Proceedings of the IEEE international conference on computer vision, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"H\u00e9naff, O.J., Koppula, S., Alayrac, J.B., van den Oord, A., Vinyals, O., Carreira, J.: Efficient visual pretraining with contrastive detection. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00993"},{"key":"8_CR43","unstructured":"H\u00e9naff, O.J., et al.: Data-efficient image recognition with contrastive predictive coding. arXiv preprint arXiv:1905.09272 (2019)"},{"key":"8_CR44","unstructured":"Hjelm, R.D., et al.: Learning deep representations by mutual information estimation and maximization. In: International Conference on Learning Representations (2018)"},{"key":"8_CR45","doi-asserted-by":"crossref","unstructured":"Ji, X., Henriques, J.F., Vedaldi, A.: Invariant information clustering for unsupervised image classification and segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9865\u20139874 (2019)","DOI":"10.1109\/ICCV.2019.00996"},{"key":"8_CR46","unstructured":"Kabra, R., et al.: Simone: view-invariant, temporally-abstracted object representations via unsupervised video decomposition. In: Advances in Neural Information Processing Systems 34 (2021)"},{"key":"8_CR47","unstructured":"Kipf, T., et al.: Conditional object-centric learning from video. arXiv preprint arXiv:2111.12594 (2021)"},{"key":"8_CR48","unstructured":"Korbar, B., Tran, D., Torresani, L.: Cooperative learning of audio and video models from self-supervised synchronization. In: Advances in Neural Information Processing Systems (2018)"},{"key":"8_CR49","doi-asserted-by":"crossref","unstructured":"Larsson, G., Maire, M., Shakhnarovich, G.: Colorization as a proxy task for visual understanding. In: CVPR, pp. 6874\u20136883 (2017)","DOI":"10.1109\/CVPR.2017.96"},{"key":"8_CR50","doi-asserted-by":"crossref","unstructured":"Lin, C., Miller, T., Dligach, D., Bethard, S., Savova, G.: EntityBERT: entity-centric masking strategy for model pretraining for the clinical domain. In: Proceedings of the 20th Workshop on Biomedical Language Processing, pp. 191\u2013201 (2021)","DOI":"10.18653\/v1\/2021.bionlp-1.21"},{"key":"8_CR51","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"8_CR52","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"8_CR53","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"8_CR54","first-page":"11525","volume":"33","author":"F Locatello","year":"2020","unstructured":"Locatello, F., et al.: Object-centric learning with slot attention. Adv. Neural. Inf. Process. Syst. 33, 11525\u201311538 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR55","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"8_CR56","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100M: learning a text-video embedding by watching hundred million narrated video clips. In: International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"8_CR57","unstructured":"Mishra, S., et al.: Object-aware cropping for self-supervised learning. arXiv preprint arXiv:2112.00319 (2021)"},{"key":"8_CR58","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/978-3-319-46448-0_32","volume-title":"Computer Vision \u2013 ECCV 2016","author":"I Misra","year":"2016","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Shuffle and learn: unsupervised learning using temporal order verification. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 527\u2013544. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_32"},{"key":"8_CR59","doi-asserted-by":"crossref","unstructured":"Nathan Mundhenk, T., Ho, D., Chen, B.Y.: Improvements to context based self-supervised learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 9339\u20139348 (2018)","DOI":"10.1109\/CVPR.2018.00973"},{"key":"8_CR60","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46466-4_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Noroozi","year":"2016","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving jigsaw puzzles. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 69\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_5"},{"key":"8_CR61","doi-asserted-by":"crossref","unstructured":"Nunes, L., Marcuzzi, R., Chen, X., Behley, J., Stachniss, C.: Segcontrast: 3D point cloud feature representation learning through self-supervised segment discrimination. IEEE Robotics and Automation Letters (2022)","DOI":"10.1109\/LRA.2022.3142440"},{"key":"8_CR62","unstructured":"van den Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"8_CR63","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"639","DOI":"10.1007\/978-3-030-01231-1_39","volume-title":"Computer Vision \u2013 ECCV 2018","author":"A Owens","year":"2018","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 639\u2013658. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_39"},{"key":"8_CR64","doi-asserted-by":"crossref","unstructured":"Pathak, D., Krahenbuhl, P., Donahue, J., Darrell, T., Efros, A.A.: Context encoders: feature learning by inpainting. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2536\u20132544 (2016)","DOI":"10.1109\/CVPR.2016.278"},{"key":"8_CR65","doi-asserted-by":"crossref","unstructured":"Peng, C., et al.: MegDet: a large mini-batch object detector. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6181\u20136189 (2018)","DOI":"10.1109\/CVPR.2018.00647"},{"key":"8_CR66","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van Gool, L., Gross, M., Sorkine-Hornung, A.: A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 724\u2013732 (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"8_CR67","unstructured":"Pinheiro, P.O., Almahairi, A., Benmalek, R.Y., Golemo, F., Courville, A.C.: Unsupervised learning of dense visual representations. In: NeurIPS (2020)"},{"key":"8_CR68","doi-asserted-by":"crossref","unstructured":"Recasens, A., et al.: Broaden your views for self-supervised video learning. In: International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00129"},{"issue":"3","key":"8_CR69","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vision 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vision"},{"key":"8_CR70","unstructured":"Ryali, C., Schwab, D.J., Morcos, A.S.: Learning background invariance improves generalization and robustness in self-supervised learning on imageNet and beyond. In: Advances in Neural Information Processing Systems (2021)"},{"key":"8_CR71","unstructured":"Shanahan, M., Nikiforou, K., Creswell, A., Kaplanis, C., Barrett, D., Garnelo, M.: An explicitly relational neural network architecture. In: International Conference on Machine Learning, pp. 8593\u20138603. PMLR (2020)"},{"key":"8_CR72","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"8_CR73","doi-asserted-by":"crossref","unstructured":"Tian, Y., Henaff, O.J., van den Oord, A.: Divide and contrast: self-supervised learning from uncurated data. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10063\u201310074 (2021)","DOI":"10.1109\/ICCV48922.2021.00991"},{"key":"8_CR74","unstructured":"Tian, Y., Sun, C., Poole, B., Krishnan, D., Schmid, C., Isola, P.: What makes for good views for contrastive learning. In: NeurIPS (2020)"},{"key":"8_CR75","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Chen, H., He, T.: FCOS: fully convolutional one-stage object detection. In: International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00972"},{"key":"8_CR76","unstructured":"Tomasev, N., et al.: Pushing the limits of self-supervised resnets: can we outperform supervised learning without labels on imagenet? arXiv preprint arXiv:2201.05119 (2022)"},{"key":"8_CR77","doi-asserted-by":"crossref","unstructured":"Van Gansbeke, W., Vandenhende, S., Georgoulis, S., Van Gool, L.: Unsupervised semantic segmentation by contrasting object mask proposals. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00990"},{"key":"8_CR78","doi-asserted-by":"crossref","unstructured":"Vincent, P., Larochelle, H., Bengio, Y., Manzagol, P.A.: Extracting and composing robust features with denoising autoencoders. In: Proceedings of the 25th international conference on Machine learning, pp. 1096\u20131103 (2008)","DOI":"10.1145\/1390156.1390294"},{"key":"8_CR79","unstructured":"Wei, F., Gao, Y., Wu, Z., Hu, H., Lin, S.: Aligning pretraining for detection via object-level contrastive learning. In: Advances in Neural Information Processing Systems 34 (2021)"},{"key":"8_CR80","doi-asserted-by":"crossref","unstructured":"Wu, S., Li, X., Wang, X.: IoU-aware single-stage object detector for accurate localization. Image and Vision Computing (2020)","DOI":"10.1016\/j.imavis.2020.103911"},{"key":"8_CR81","doi-asserted-by":"crossref","unstructured":"Wu, Z., Xiong, Y., Yu, S.X., Lin, D.: Unsupervised feature learning via non-parametric instance discrimination. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3733\u20133742 (2018)","DOI":"10.1109\/CVPR.2018.00393"},{"key":"8_CR82","unstructured":"Xie, J., Zhan, X., Liu, Z., Ong, Y., Loy, C.C.: Unsupervised object-level representation learning from scene images. In: Advances in Neural Information Processing Systems 34 (2021)"},{"key":"8_CR83","unstructured":"Xie, Z., et al.: Self-supervised learning with swin transformers. arXiv preprint arXiv:2105.04553 (2021)"},{"key":"8_CR84","doi-asserted-by":"crossref","unstructured":"Xie, Z., Lin, Y., Zhang, Z., Cao, Y., Lin, S., Hu, H.: Propagate yourself: exploring pixel-level consistency for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16684\u201316693 (2021)","DOI":"10.1109\/CVPR46437.2021.01641"},{"key":"8_CR85","doi-asserted-by":"crossref","unstructured":"Yang, C., Lamdouar, H., Lu, E., Zisserman, A., Xie, W.: Self-supervised video object segmentation by motion grouping. In: International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00709"},{"key":"8_CR86","unstructured":"You, Y., Gitman, I., Ginsburg, B.: Large batch training of convolutional networks. arXiv preprint arXiv:1708.03888 (2017)"},{"key":"8_CR87","unstructured":"Zbontar, J., Jing, L., Misra, I., LeCun, Y., Deny, S.: Barlow twins: self-supervised learning via redundancy reduction. In: International Conference on Machine Learning, pp. 12310\u201312320. PMLR (2021)"},{"key":"8_CR88","unstructured":"Zhang, F., Torr, P., Ranftl, R., Richter, S.: Looking beyond single images for contrastive semantic segmentation learning. In: Advances in Neural Information Processing Systems 34 (2021)"},{"key":"8_CR89","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Colorful image colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 649\u2013666. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_40"},{"key":"8_CR90","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Split-brain autoencoders: unsupervised learning by cross-channel prediction. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1058\u20131067 (2017)","DOI":"10.1109\/CVPR.2017.76"},{"key":"8_CR91","doi-asserted-by":"crossref","unstructured":"Zhang, S., Chi, C., Yao, Y., Lei, Z., Li, S.Z.: Bridging the gap between anchor-based and anchor-free detection via adaptive training sample selection. In: IEEE Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.00978"},{"key":"8_CR92","unstructured":"Zhao, N., Wu, Z., Lau, R.W., Lin, S.: Distilling localization for self-supervised representation learning. arXiv preprint arXiv:2004.06638 (2020)"},{"key":"8_CR93","doi-asserted-by":"crossref","unstructured":"Zoran, D., Kabra, R., Lerchner, A., Rezende, D.J.: Parts: unsupervised segmentation with slots, attention and independence maximization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10439\u201310447 (2021)","DOI":"10.1109\/ICCV48922.2021.01027"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19812-0_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T20:52:50Z","timestamp":1728247970000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19812-0_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198113","9783031198120"],"references-count":93,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19812-0_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"30 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"From the workshops, 367 reviewed full papers have been selected for publication","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}