{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T20:54:36Z","timestamp":1768856076390,"version":"3.49.0"},"publisher-location":"Cham","reference-count":24,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032144911","type":"print"},{"value":"9783032144928","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-14492-8_23","type":"book-chapter","created":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T07:08:20Z","timestamp":1768806500000},"page":"297-309","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Self-supervised Structured Object Representation Learning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4601-8778","authenticated-orcid":false,"given":"Oussama","family":"Hadjerci","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6915-1938","authenticated-orcid":false,"given":"Antoine","family":"Letienne","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9492-3719","authenticated-orcid":false,"given":"Mohamed Abbas","family":"Hedjazi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3185-9996","authenticated-orcid":false,"given":"Adel","family":"Hafiane","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,20]]},"reference":[{"key":"23_CR1","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: bert pre-training of image transformers. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"23_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9915\u20139925 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"23_CR4","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: III, H.D., Singh, A. (eds.) Proceedings of the 37th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0119, pp. 1597\u20131607. PMLR (2020)"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the ACL (NAACL-HLT), pp. 4171\u20134186 (2019)","DOI":"10.18653\/v1\/N19-1423"},{"key":"23_CR7","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"23_CR8","unstructured":"Gao, S., Zhou, P., Cheng, M.M., Yan, S.: Towards sustainable self-supervised learning. In: Proceedings of the 36th Conference on Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9726\u20139735 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"23_CR11","doi-asserted-by":"crossref","unstructured":"H\u00e9naff, O.J., Koppula, S., Alayrac, J.B., van\u00a0den Oord, A., Vinyals, O., Carreira, J.: Efficient visual pretraining with contrastive detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10086\u201310096 (2021)","DOI":"10.1109\/ICCV48922.2021.00993"},{"key":"23_CR12","unstructured":"Jiang, N., Dravid, A., Efros, A., Gandelsman, Y.: Vision transformers don\u2019t need trained registers (2025)"},{"key":"23_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"23_CR14","unstructured":"Locatello, F., et al.: Object-centric learning with slot attention. In: Advances in Neural Information Processing Systems (NeurIPS), vol.\u00a033, pp. 11528\u201311539 (2020)"},{"key":"23_CR15","unstructured":"Oquab, M., et al.: Dinov2: learning robust visual features without supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)"},{"key":"23_CR16","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I.: Improving language understanding by generative pre-training. OpenAI Tech. Rep. (2018)"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Rambhatla, S.S., Misra, I., Chellappa, R., Shrivastava, A.: Multiple object localization with self-supervised transformers for object discovery. arXiv preprint arXiv:2304.05387 (2023)","DOI":"10.1109\/ICCV51070.2023.01450"},{"key":"23_CR18","unstructured":"Touvron, H., et al.: Llama: open and efficient foundation language models. CoRR, abs\/2302.13971 (2023)"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Wang, X., Girdhar, R., Yu, S.X., Misra, I.: Cut and learn for unsupervised object detection and instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3124\u20133134 (2023)","DOI":"10.1109\/CVPR52729.2023.00305"},{"key":"23_CR20","unstructured":"Wei, F., Gao, Y., Wu, Z., Hu, H., Lin, S.: Aligning pretraining for detection via object-level contrastive learning. In: Proceedings of the 35th International Conference on Neural Information Processing Systems (NeurIPS) (2021)"},{"key":"23_CR21","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2020.102907","volume":"193","author":"L Wen","year":"2020","unstructured":"Wen, L., et al.: UA-DETRAC: a new benchmark and protocol for multi-object detection and tracking. Comput. Vis. Image Underst. 193, 102907 (2020)","journal-title":"Comput. Vis. Image Underst."},{"key":"23_CR22","unstructured":"Wen, X., Zhao, B., Zheng, A., Zhang, X., Qi, X.: Self-supervised visual representation learning with semantic grouping. In: Advances in Neural Information Processing Systems, vol.\u00a035, pp. 22644\u201322658 (2022)"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Zhao, Y., et al.: Detrs beat yolos on real-time object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.01605"},{"key":"23_CR24","unstructured":"Zhou, J., et al.: Image bert pre-training with online tokenizer. In: Proceedings of the 10th International Conference on Learning Representations (ICLR) (2022)"}],"container-title":["Lecture Notes in Computer Science","Advances in Visual Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-14492-8_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T07:08:28Z","timestamp":1768806508000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-14492-8_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032144911","9783032144928"],"references-count":24,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-14492-8_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"20 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISVC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Symposium on Visual Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Las Vegas, NV","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 November 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"isvc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.isvc.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}