{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:29:30Z","timestamp":1777865370364,"version":"3.51.4"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01057","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"11359-11368","source":"Crossref","is-referenced-by-count":0,"title":["Ock: Unsupervised Dynamic Video Prediction With Object-Centric Kinematics"],"prefix":"10.1109","author":[{"given":"Yeon-Ji","family":"Song","sequence":"first","affiliation":[{"name":"Seoul National University"}]},{"given":"Jaein","family":"Kim","sequence":"additional","affiliation":[{"name":"Seoul National University"}]},{"given":"Suhyung","family":"Choi","sequence":"additional","affiliation":[{"name":"Seoul National University"}]},{"given":"Jin-Hwa","family":"Kim","sequence":"additional","affiliation":[{"name":"SNU AIIS"}]},{"given":"Byoung-Tak","family":"Zhang","sequence":"additional","affiliation":[{"name":"Seoul National University"}]}],"member":"263","reference":[{"key":"ref1","first-page":"20014","article-title":"Xcit: Cross-covariance image transformers","volume":"34","author":"Ali","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref3","author":"Christopher","journal-title":"Monet: Unsupervised scene decomposition and representation"},{"key":"ref4","first-page":"213","article-title":"End-toend object detection with transformers","volume-title":"Proceedings of the European Conference on Computer Vision","author":"Carion","year":"2020"},{"key":"ref5","author":"Chakravarthy","year":"2023","journal-title":"Spotlight attention: Robust object-centric learning with a spatial locality prior"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"ref7","author":"Daniel","year":"2023","journal-title":"Ddlp: Unsupervised objectcentric video prediction with deep dynamic latent particles"},{"key":"ref8","author":"Dziri","year":"2023","journal-title":"Faith and fate: Limits of transformers on compositionality (2023)"},{"key":"ref9","author":"Engelcke","journal-title":"Genesis: Generative scene inference and sampling with object-centric latent representations"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00317"},{"key":"ref11","first-page":"2424","article-title":"Multi-object representation learning with iterative variational inference","volume-title":"International Conference on Machine Learning","author":"Greff","year":"2019"},{"key":"ref12","first-page":"3749","article-title":"Fleet, Dan Gnanapragasam, Florian Golemo, Charles Herrmann, Thomas Kipf, Abhijit Kundu, Dmitry Lagun, Issam Laradji, HsuehTi (Derek) Liu, Henning Meyer, Yishu Miao, Derek Nowrouzezahrai, Cengiz Oztireli, Etienne Pot, Noha Radwan, Daniel Rebain, Sara Sabour, Mehdi S","volume-title":"M. Sajjadi","author":"Greff","year":"2022"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.3233\/sw-223228"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00593"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1049\/el:20080522"},{"key":"ref16","first-page":"12572","article-title":"Generative neurosymbolic machines","volume":"33","author":"Jiang","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/0010-0285(92)90007-O"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01767"},{"key":"ref19","article-title":"Bilinear attention networks","author":"Kim","year":"2018","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref20","author":"Kipf","year":"2021","journal-title":"Conditional object-centric learning from video"},{"key":"ref21","article-title":"Improving generative imagination in object-centric world models, 2020","author":"Lin","journal-title":"1"},{"key":"ref22","first-page":"11525","article-title":"Objectcentric learning with slot attention","volume":"33","author":"Locatello","year":"2020","journal-title":"Advances in Neural In-formation Processing Systems"},{"key":"ref23","first-page":"3","author":"Ming","journal-title":"A survey on video prediction: From deterministic to generative approaches, 2024"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/0921-8890(91)90015-D"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_44"},{"key":"ref26","author":"Radford","journal-title":"Improving language understanding by generative pre-training, 2018"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.4236\/jcc.2019.73002"},{"key":"ref28","author":"Seitzer","year":"2022","journal-title":"Bridging the gap to real-world object-centric learning"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3243465"},{"key":"ref30","author":"Singh","year":"2021","journal-title":"Illiterate dall-e learns to compose"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1322"},{"key":"ref32","article-title":"Learning object motion and appearance dynamics with object-centric representations","volume-title":"In Causal Representation Learning Workshop at NeurIPS","author":"Song","year":"2023"},{"key":"ref33","first-page":"2","author":"Song","journal-title":"Dbmovi-gs: Dynamic view synthesis from blurry monocular video via sparse-controlled gaussian splatting, 2025"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02227"},{"key":"ref36","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP49359.2023.10222810"},{"key":"ref38","article-title":"Eidetic 3d lstm: A model for video prediction and beyond","volume-title":"International conference on learning representations","author":"Wang","year":"2018"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3165153"},{"key":"ref40","author":"Wang","year":"2023","journal-title":"Slot-vae: Object-centric scene generation with slot attention"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i8.20841"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01729"},{"key":"ref44","author":"Wu","year":"2022","journal-title":"Slotformer: Unsupervised visual dynamics simulation with object-centric models"},{"key":"ref45","article-title":"Self-supervised visual reinforcement learning with object-centric representations","volume-title":"International Conference on Learning Representations","author":"Zadaianchuk","year":"2021"},{"key":"ref46","first-page":"384","article-title":"Selfsupervised reinforcement learning with independently controllable subgoals","volume-title":"Conference on Robot Learning","author":"Zadaianchuk","year":"2022"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01522"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11446008.pdf?arnumber=11446008","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:20:04Z","timestamp":1777530004000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11446008\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01057","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}