{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:34:29Z","timestamp":1757619269818,"version":"3.44.0"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2025,7,15]],"date-time":"2025-07-15T00:00:00Z","timestamp":1752537600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,15]],"date-time":"2025-07-15T00:00:00Z","timestamp":1752537600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s11432-024-4342-6","type":"journal-article","created":{"date-parts":[[2025,7,21]],"date-time":"2025-07-21T03:31:13Z","timestamp":1753068673000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ROLA: real-world object-centric learning with attention optimization"],"prefix":"10.1007","volume":"68","author":[{"given":"Qu","family":"Tang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haochen","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiangyu","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhen","family":"Lei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhaoxiang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,7,15]]},"reference":[{"key":"4342_CR1","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1111\/j.1467-7687.2007.00569.x","volume":"10","author":"E S Spelke","year":"2007","unstructured":"Spelke E S, Kinzler K D. Core knowledge. Dev Sci, 2007, 10: 89\u201396","journal-title":"Dev Sci"},{"key":"4342_CR2","unstructured":"Burgess C P, Matthey L, Watters N, et al. MONet: unsupervised scene decomposition and representation. 2019. ArXiv:1901.11390"},{"key":"4342_CR3","first-page":"2424","volume-title":"Proceedings of the International Conference on Machine Learning","author":"K Greff","year":"2019","unstructured":"Greff K, Kaufman R L, Kabra R, et al. Multi-object representation learning with iterative variational inference. In: Proceedings of the International Conference on Machine Learning, 2019. 2424\u20132433"},{"key":"4342_CR4","first-page":"11525","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"F Locatello","year":"2020","unstructured":"Locatello F, Weissenborn D, Unterthiner T, et al. Object-centric learning with slot attention. In: Proceedings of the Advances in Neural Information Processing Systems, 2020. 11525\u201311538"},{"key":"4342_CR5","first-page":"23252","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Q Tang","year":"2023","unstructured":"Tang Q, Zhu X, Lei Z, et al. Intrinsic physical concepts discovery with object-centric predictive models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023. 23252\u201323261"},{"key":"4342_CR6","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Q Tang","year":"2022","unstructured":"Tang Q, Zhu X, Lei Z, et al. Object dynamics distillation for scene decomposition and representation. In: Proceedings of the International Conference on Learning Representations, 2022"},{"key":"4342_CR7","doi-asserted-by":"publisher","first-page":"1257","DOI":"10.1038\/s41562-022-01394-8","volume":"6","author":"L S Piloto","year":"2022","unstructured":"Piloto L S, Weinstein A, Battaglia P, et al. Intuitive physics learning in a deep-learning model inspired by developmental psychology. Nat Hum Behav, 2022, 6: 1257\u20131267","journal-title":"Nat Hum Behav"},{"key":"4342_CR8","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Z Wu","year":"2023","unstructured":"Wu Z, Nikita D, Klaus G, et al. SlotFormer: unsupervised visual dynamics simulation with object-centric models. In: Proceedings of the International Conference on Learning Representations, 2023"},{"key":"4342_CR9","first-page":"9112","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"D Ding","year":"2021","unstructured":"Ding D, Hill F, Santoro A, et al. Attention over learned object embeddings enables complex visual reasoning. In: Proceedings of the Advances in Neural Information Processing Systems, 2021. 9112\u20139124"},{"key":"4342_CR10","volume-title":"Proceedings of the International Conference on Learning Representations","author":"T Kipf","year":"2020","unstructured":"Kipf T, van der Pol E, Welling M. Contrastive learning of structured world models. In: Proceedings of the International Conference on Learning Representations, 2020"},{"key":"4342_CR11","volume-title":"Proceedings of the International Conference on Learning Representations","author":"A Nakano","year":"2022","unstructured":"Nakano A, Suzuki M, Matsuo Y. Interaction-based disentanglement of entities for object-centric world models. In: Proceedings of the International Conference on Learning Representations, 2022"},{"key":"4342_CR12","unstructured":"Zadaianchuk A, Seitzer M, Martius G. Self-supervised visual reinforcement learning with object-centric representations. 2020. ArXiv:2011.14381"},{"key":"4342_CR13","first-page":"7111","volume-title":"Proceedings of the International Conference on Robotics and Automation (ICRA)","author":"C Devin","year":"2018","unstructured":"Devin C, Abbeel P, Darrell T, et al. Deep object-centric representations for generalizable robot learning. In: Proceedings of the International Conference on Robotics and Automation (ICRA), 2018. 7111\u20137118"},{"key":"4342_CR14","volume-title":"Proceedings of the International Conference on Machine Learning","author":"A Ramesh","year":"2021","unstructured":"Ramesh A, Pavlov M, Goh G, et al. Zero-shot text-to-image generation. In: Proceedings of the International Conference on Machine Learning, 2021"},{"key":"4342_CR15","first-page":"740","volume-title":"Proceedings of the European Conference on Computer Vision","author":"T Y Lin","year":"2014","unstructured":"Lin T Y, Maire M, Belongie S, et al. Microsoft COCO: common objects in context. In: Proceedings of the European Conference on Computer Vision, 2014. 740\u2013755"},{"key":"4342_CR16","first-page":"9650","volume-title":"Proceedings of the International Conference on Computer Vision","author":"M Caron","year":"2021","unstructured":"Caron M, Touvron H, Misra I, et al. Emerging properties in self-supervised vision transformers. In: Proceedings of the International Conference on Computer Vision, 2021. 9650\u20139660"},{"key":"4342_CR17","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, et al. Attention is all you need. In: Proceedings of the Advances in Neural Information Processing Systems, 2017"},{"key":"4342_CR18","unstructured":"Kingma D P, Welling M. Auto-encoding variational Bayes. 2013. ArXiv:1312.6114"},{"key":"4342_CR19","first-page":"12873","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"P Esser","year":"2021","unstructured":"Esser P, Rombach R, Ommer B. Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021. 12873\u201312883"},{"key":"4342_CR20","unstructured":"Singh G, Deng F, Ahn S. Illiterate DALL-E learns to compose. 2021. ArXiv:2110.11405"},{"key":"4342_CR21","first-page":"18181","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"G Singh","year":"2022","unstructured":"Singh G, Wu Y F, Ahn S. Simple unsupervised object-centric learning for complex and naturalistic videos. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 18181\u201318196"},{"key":"4342_CR22","volume-title":"Proceedings of the International Conference on Learning Representations","author":"M Seitzer","year":"2022","unstructured":"Seitzer M, Horn M, Zadaianchuk A, et al. Bridging the gap to real-world object-centric learning. In: Proceedings of the International Conference on Learning Representations, 2022"},{"key":"4342_CR23","first-page":"50932","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"Z Wu","year":"2023","unstructured":"Wu Z, Hu J, Lu W, et al. SlotDiffusion: object-centric generative modeling with diffusion models. In: Proceedings of the Advances in Neural Information Processing Systems, 2023. 50932\u201350958"},{"key":"4342_CR24","doi-asserted-by":"crossref","unstructured":"Kakogeorgiou I, Gidaris S, Karantzaloset K, et al. SPOT: self-training with patch-order permutation for object-centric learning with autoregressive transformers. 2023. ArXiv:2312.00648","DOI":"10.1109\/CVPR52733.2024.02149"},{"key":"4342_CR25","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et al. An image is worth 16x16 words: transformers for image recognition at scale. 2020. ArXiv:2010.11929"},{"key":"4342_CR26","doi-asserted-by":"crossref","unstructured":"Simeoni O, Puy G, Vo H V, et al. Localizing objects with self-supervised transformers and no labels. 2021. ArXiv:2109.14279","DOI":"10.5244\/C.35.365"},{"key":"4342_CR27","unstructured":"Peng Z, Dong L, Bao H, et al. A unified view of masked image modeling. 2022. ArXiv:2210.10615"},{"key":"4342_CR28","first-page":"3124","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"X Wang","year":"2023","unstructured":"Wang X, Girdhar R, Yu S X, et al. Cut and learn for unsupervised object detection and instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023. 3124\u20133134"},{"key":"4342_CR29","unstructured":"Niu D, Wang X, Han X, et al. Unsupervised universal image segmentation. 2023. ArXiv:2312.17243"},{"key":"4342_CR30","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham M, van Gool L, Williams C K I, et al. The Pascal visual object classes (VOC) challenge. Int J Comput Vis, 2010, 88: 303\u2013338","journal-title":"Int J Comput Vis"},{"key":"4342_CR31","series-title":"Technical Report","volume-title":"Caltech-UCSD Birds 200","author":"P Welinder","year":"2010","unstructured":"Welinder P, Branson S, Mita T, et al. Caltech-UCSD Birds 200. Technical Report CNS-TR-2010-001, 2010"},{"key":"4342_CR32","first-page":"2117","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"T Y Lin","year":"2017","unstructured":"Lin T Y, Doll\u00e1r P, Girshick R, et al. Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017. 2117\u20132125"},{"key":"4342_CR33","first-page":"3","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV)","author":"Y Wu","year":"2018","unstructured":"Wu Y, He K. Group normalization. In: Proceedings of the European Conference on Computer Vision (ECCV), 2018. 3\u201319"},{"key":"4342_CR34","unstructured":"Ren T, Liu S, Zeng A, et al. Grounded SAM: assembling open-world models for diverse visual tasks. 2024. ArXiv:2401.14159"},{"key":"4342_CR35","first-page":"1724","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Zhang","year":"2024","unstructured":"Zhang Y, Huang X, Ma J, et al. Recognize anything: a strong image tagging model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 1724\u20131732"},{"key":"4342_CR36","doi-asserted-by":"crossref","unstructured":"Liu S, Zeng Z, Ren T, et al. Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. 2023. ArXiv:2303.05499","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"4342_CR37","first-page":"4015","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"A Kirillov","year":"2023","unstructured":"Kirillov A, Mintun E, Ravi N, et al. Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023. 4015\u20134026"},{"key":"4342_CR38","first-page":"17864","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"B Cheng","year":"2021","unstructured":"Cheng B, Schwing A, Kirillov A. Per-pixel classification is not all you need for semantic segmentation. In: Proceedings of the Advances in Neural Information Processing Systems, 2021. 17864\u201317875"},{"key":"4342_CR39","first-page":"1290","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"B Cheng","year":"2022","unstructured":"Cheng B, Misra I, Schwing A G, et al. Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022. 1290\u20131299"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4342-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-024-4342-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4342-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,7]],"date-time":"2025-09-07T16:04:40Z","timestamp":1757261080000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-024-4342-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,15]]},"references-count":39,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["4342"],"URL":"https:\/\/doi.org\/10.1007\/s11432-024-4342-6","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"type":"print","value":"1674-733X"},{"type":"electronic","value":"1869-1919"}],"subject":[],"published":{"date-parts":[[2025,7,15]]},"assertion":[{"value":"28 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 December 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 July 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"192105"}}