{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:04:15Z","timestamp":1750309455264,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700207","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["STODINE: Decompose video to Object-centric Spatial-Temporal Slots for physical reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-9447-0365","authenticated-orcid":false,"given":"Haoyuan","family":"Zhang","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, UCAS, Beijing, China and MAIS, CASIA, Bejing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4636-9677","authenticated-orcid":false,"given":"Xiangyu","family":"Zhu","sequence":"additional","affiliation":[{"name":"MAIS, CASIA, Beijing, China and School of Artificial Intelligence, UCAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4377-2581","authenticated-orcid":false,"given":"Qu","family":"Tang","sequence":"additional","affiliation":[{"name":"MAIS, CASIA, Beijing, China and School of Artificial Intelligence, UCAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2648-3875","authenticated-orcid":false,"given":"Zhaoxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"MAIS, CASIA, Beijing, China and School of Artificial Intelligence, UCAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0791-189X","authenticated-orcid":false,"given":"Zhen","family":"Lei","sequence":"additional","affiliation":[{"name":"MAIS, CASIA, Beijing, China and School of Artificial Intelligence, UCAS, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"doi-asserted-by":"publisher","key":"e_1_3_3_2_2_2","DOI":"10.1109\/ICCV.2015.279"},{"doi-asserted-by":"crossref","unstructured":"Vijay Badrinarayanan Alex Kendall and Roberto Cipolla. 2017. Segnet: A deep convolutional encoder-decoder architecture for image segmentation. IEEE transactions on pattern analysis and machine intelligence 39 12 (2017) 2481\u20132495.","key":"e_1_3_3_2_3_2","DOI":"10.1109\/TPAMI.2016.2644615"},{"key":"e_1_3_3_2_4_2","volume-title":"International Conference on Learning Representations","author":"Baradel Fabien","year":"2020","unstructured":"Fabien Baradel, Natalia Neverova, Julien Mille, Greg Mori, and Christian Wolf. 2020. COPHY: Counterfactual Learning of Physical Dynamics. In International Conference on Learning Representations."},{"doi-asserted-by":"crossref","unstructured":"Paul\u00a0M Bays and Masud Husain. 2008. Dynamic shifts of limited working memory resources in human vision. Science 321 5890 (2008) 851\u2013854.","key":"e_1_3_3_2_5_2","DOI":"10.1126\/science.1158023"},{"unstructured":"Christopher\u00a0P Burgess Loic Matthey Nicholas Watters Rishabh Kabra Irina Higgins Matt Botvinick and Alexander Lerchner. 2019. Monet: Unsupervised scene decomposition and representation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1901.11390 (2019).","key":"e_1_3_3_2_6_2"},{"doi-asserted-by":"publisher","key":"e_1_3_3_2_7_2","DOI":"10.1109\/ICCV48922.2021.00951"},{"doi-asserted-by":"crossref","unstructured":"Liang-Chieh Chen George Papandreou Iasonas Kokkinos Kevin Murphy and Alan\u00a0L Yuille. 2017. Deeplab: Semantic image segmentation with deep convolutional nets atrous convolution and fully connected crfs. IEEE transactions on pattern analysis and machine intelligence 40 4 (2017) 834\u2013848.","key":"e_1_3_3_2_8_2","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"e_1_3_3_2_9_2","volume-title":"International Conference on Learning Representations","author":"Chen Zhenfang","year":"2022","unstructured":"Zhenfang Chen, Kexin Yi, Antonio Torralba, Josh Tenenbaum, and Chuang Gan. 2022. ComPhy: Compositional Physical Reasoning of Objects and Events from Videos. In International Conference on Learning Representations."},{"unstructured":"Kyunghyun Cho Bart Van\u00a0Merri\u00ebnboer Caglar Gulcehre Dzmitry Bahdanau Fethi Bougares Holger Schwenk and Yoshua Bengio. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1406.1078 (2014).","key":"e_1_3_3_2_10_2"},{"doi-asserted-by":"publisher","key":"e_1_3_3_2_11_2","DOI":"10.1609\/aaai.v33i01.33013412"},{"doi-asserted-by":"crossref","unstructured":"Arthur\u00a0P Dempster Nan\u00a0M Laird and Donald\u00a0B Rubin. 1977. Maximum likelihood from incomplete data via the EM algorithm. Journal of the royal statistical society: series B (methodological) 39 1 (1977) 1\u201322.","key":"e_1_3_3_2_12_2","DOI":"10.1111\/j.2517-6161.1977.tb01600.x"},{"unstructured":"David Ding Felix Hill Adam Santoro Malcolm Reynolds and Matt Botvinick. 2021. Attention over learned object embeddings enables complex visual reasoning. Advances in neural information processing systems 34 (2021) 9112\u20139124.","key":"e_1_3_3_2_13_2"},{"unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et\u00a0al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020).","key":"e_1_3_3_2_14_2"},{"unstructured":"Gamaleldin Elsayed Aravindh Mahendran Sjoerd Van\u00a0Steenkiste Klaus Greff Michael\u00a0C Mozer and Thomas Kipf. 2022. Savi++: Towards end-to-end object-centric learning from real-world videos. Advances in Neural Information Processing Systems 35 (2022) 28940\u201328954.","key":"e_1_3_3_2_15_2"},{"unstructured":"SM Eslami Nicolas Heess Theophane Weber Yuval Tassa David Szepesvari Geoffrey\u00a0E Hinton et\u00a0al. 2016. Attend infer repeat: Fast scene understanding with generative models. Advances in neural information processing systems 29 (2016).","key":"e_1_3_3_2_16_2"},{"key":"e_1_3_3_2_17_2","first-page":"2424","volume-title":"International conference on machine learning","author":"Greff Klaus","year":"2019","unstructured":"Klaus Greff, Rapha\u00ebl\u00a0Lopez Kaufman, Rishabh Kabra, Nick Watters, Christopher Burgess, Daniel Zoran, Loic Matthey, Matthew Botvinick, and Alexander Lerchner. 2019. Multi-object representation learning with iterative variational inference. In International conference on machine learning. PMLR, 2424\u20132433."},{"unstructured":"Klaus Greff Sjoerd Van\u00a0Steenkiste and J\u00fcrgen Schmidhuber. 2017. Neural expectation maximization. Advances in Neural Information Processing Systems 30 (2017).","key":"e_1_3_3_2_18_2"},{"unstructured":"Klaus Greff Sjoerd Van\u00a0Steenkiste and J\u00fcrgen Schmidhuber. 2020. On the binding problem in artificial neural networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2012.05208 (2020).","key":"e_1_3_3_2_19_2"},{"doi-asserted-by":"crossref","unstructured":"Quan Gu Alessandro Dai Tian Ye Bo Huang Xiqian Lu Mowei Shen and Zaifeng Gao. 2021. Object-based encoding in visual working memory: A critical revisit. Q J Exp Psychol (Hove) 75 8 (Oct. 2021) 1397\u20131410.","key":"e_1_3_3_2_20_2","DOI":"10.1177\/17470218211052502"},{"doi-asserted-by":"crossref","unstructured":"Quan Gu Xueyi Wan Hong Ma Xiqian Lu Yang Guo Mowei Shen and Zaifeng Gao. 2020. Event-based encoding of biological motion and location in visual working memory. Q J Exp Psychol (Hove) 73 8 (Feb. 2020) 1261\u20131277.","key":"e_1_3_3_2_21_2","DOI":"10.1177\/1747021820903042"},{"doi-asserted-by":"publisher","key":"e_1_3_3_2_22_2","DOI":"10.1109\/CVPR.2016.90"},{"unstructured":"Jun-Ting Hsieh Bingbin Liu De-An Huang Li\u00a0F Fei-Fei and Juan\u00a0Carlos Niebles. 2018. Learning to decompose and disentangle representations for video prediction. Advances in neural information processing systems 31 (2018).","key":"e_1_3_3_2_23_2"},{"unstructured":"Drew\u00a0A Hudson and Christopher\u00a0D Manning. 2018. Compositional attention networks for machine reasoning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1803.03067 (2018).","key":"e_1_3_3_2_24_2"},{"unstructured":"Jindong Jiang Fei Deng Gautam Singh and Sungjin Ahn. 2024. Object-Centric Slot Diffusion. Advances in Neural Information Processing Systems 36 (2024).","key":"e_1_3_3_2_25_2"},{"unstructured":"Diederik\u00a0P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1312.6114 (2013).","key":"e_1_3_3_2_26_2"},{"unstructured":"Thomas Kipf Gamaleldin\u00a0F Elsayed Aravindh Mahendran Austin Stone Sara Sabour Georg Heigold Rico Jonschkowski Alexey Dosovitskiy and Klaus Greff. 2021. Conditional object-centric learning from video. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.12594 (2021).","key":"e_1_3_3_2_27_2"},{"unstructured":"Adam Kosiorek Hyunjik Kim Yee\u00a0Whye Teh and Ingmar Posner. 2018. Sequential attend infer repeat: Generative modelling of moving objects. Advances in Neural Information Processing Systems 31 (2018).","key":"e_1_3_3_2_28_2"},{"doi-asserted-by":"publisher","key":"e_1_3_3_2_29_2","DOI":"10.1109\/CVPR42600.2020.00999"},{"unstructured":"Francesco Locatello Dirk Weissenborn Thomas Unterthiner Aravindh Mahendran Georg Heigold Jakob Uszkoreit Alexey Dosovitskiy and Thomas Kipf. 2020. Object-centric learning with slot attention. Advances in neural information processing systems 33 (2020) 11525\u201311538.","key":"e_1_3_3_2_30_2"},{"doi-asserted-by":"publisher","key":"e_1_3_3_2_31_2","DOI":"10.1109\/CVPR.2015.7298965"},{"doi-asserted-by":"crossref","unstructured":"Gerrit\u00a0W Maus Jason Fischer and David Whitney. 2013. Motion-dependent representation of space in area MT+. Neuron 78 3 (2013) 554\u2013562.","key":"e_1_3_3_2_32_2","DOI":"10.1016\/j.neuron.2013.03.010"},{"key":"e_1_3_3_2_33_2","first-page":"23533","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Nguyen Trang","year":"2023","unstructured":"Trang Nguyen, Amin Mansouri, Kanika Madan, Khuong\u00a0Duy Nguyen, Kartik Ahuja, Dianbo Liu, and Yoshua Bengio. 2023. Reusable Slotwise Mechanisms. In Advances in Neural Information Processing Systems , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 23533\u201323556. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/49ff6951ef47bc9bab276a31a965528e-Paper-Conference.pdf"},{"doi-asserted-by":"crossref","unstructured":"Nora Nortmann Sascha Rekauzke Selim Onat Peter K\u00f6nig and Dirk Jancke. 2015. Primary visual cortex represents the difference between past and present. Cerebral Cortex 25 6 (2015) 1427\u20131440.","key":"e_1_3_3_2_34_2","DOI":"10.1093\/cercor\/bht318"},{"unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et\u00a0al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.07193 (2023).","key":"e_1_3_3_2_35_2"},{"doi-asserted-by":"publisher","key":"e_1_3_3_2_36_2","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_3_2_37_2","volume-title":"10th International Conference on Learning Representations, ICLR 2022","author":"Singh Gautam","year":"2022","unstructured":"Gautam Singh, Fei Deng, and Sungjin Ahn. 2022. ILLITERATE DALL-E LEARNS TO COMPOSE. In 10th International Conference on Learning Representations, ICLR 2022."},{"unstructured":"Gautam Singh Yi-Fu Wu and Sungjin Ahn. 2022. Simple unsupervised object-centric learning for complex and naturalistic videos. Advances in Neural Information Processing Systems 35 (2022) 18181\u201318196.","key":"e_1_3_3_2_38_2"},{"key":"e_1_3_3_2_39_2","volume-title":"International Conference on Learning Representations","author":"Tang Qu","year":"2021","unstructured":"Qu Tang, Xiangyu Zhu, Zhen Lei, and Zhaoxiang Zhang. 2021. Object dynamics distillation for scene decomposition and representation. In International Conference on Learning Representations."},{"doi-asserted-by":"publisher","key":"e_1_3_3_2_40_2","DOI":"10.1109\/CVPR52729.2023.02227"},{"unstructured":"Sjoerd Van\u00a0Steenkiste Michael Chang Klaus Greff and J\u00fcrgen Schmidhuber. 2018. Relational neural expectation maximization: Unsupervised discovery of objects and their interactions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1802.10353 (2018).","key":"e_1_3_3_2_41_2"},{"unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017).","key":"e_1_3_3_2_42_2"},{"unstructured":"Ziyi Wu Nikita Dvornik Klaus Greff Thomas Kipf and Animesh Garg. 2022. Slotformer: Unsupervised visual dynamics simulation with object-centric models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.05861 (2022).","key":"e_1_3_3_2_43_2"},{"key":"e_1_3_3_2_44_2","volume-title":"International Conference on Learning Representations","author":"Yi Kexin","year":"2020","unstructured":"Kexin Yi, Chuang Gan, Yunzhu Li, Pushmeet Kohli, Jiajun Wu, Antonio Torralba, and Joshua\u00a0B Tenenbaum. 2020. CLEVRER: CoLlision Events for Video REpresentation and Reasoning. In International Conference on Learning Representations."},{"unstructured":"Polina Zablotskaia Edoardo\u00a0A Dominici Leonid Sigal and Andreas\u00a0M Lehrmann. 2020. Unsupervised video decomposition using spatio-temporal iterative inference. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2006.14727 (2020).","key":"e_1_3_3_2_45_2"},{"doi-asserted-by":"crossref","unstructured":"Jie Zheng Andrea\u00a0GP Schjetnan Mar Yebra Bernard\u00a0A Gomes Clayton\u00a0P Mosher Suneil\u00a0K Kalia Taufik\u00a0A Valiante Adam\u00a0N Mamelak Gabriel Kreiman and Ueli Rutishauser. 2022. Neurons detect cognitive boundaries to structure episodic memories in humans. Nature neuroscience 25 3 (2022) 358\u2013368.","key":"e_1_3_3_2_46_2","DOI":"10.1038\/s41593-022-01020-w"},{"unstructured":"Jinghao Zhou Chen Wei Huiyu Wang Wei Shen Cihang Xie Alan Yuille and Tao Kong. 2021. ibot: Image bert pre-training with online tokenizer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.07832 (2021).","key":"e_1_3_3_2_47_2"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MMAsia '24","name":"MMAsia '24: ACM Multimedia Asia","location":"Auckland New Zealand"},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700207","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700207","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:15Z","timestamp":1750295415000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700207"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":46,"alternative-id":["10.1145\/3696409.3700207","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700207","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}