{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T08:38:28Z","timestamp":1743064708106,"version":"3.40.3"},"publisher-location":"Cham","reference-count":79,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726903"},{"type":"electronic","value":"9783031726910"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72691-0_21","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T18:07:11Z","timestamp":1730570831000},"page":"368-387","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Understanding Physical Dynamics with\u00a0Counterfactual World Modeling"],"prefix":"10.1007","author":[{"given":"Rahul","family":"Venkatesh","sequence":"first","affiliation":[]},{"given":"Honglin","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Kevin","family":"Feigelis","sequence":"additional","affiliation":[]},{"given":"Daniel M.","family":"Bear","sequence":"additional","affiliation":[]},{"given":"Khaled","family":"Jedoui","sequence":"additional","affiliation":[]},{"given":"Klemen","family":"Kotar","sequence":"additional","affiliation":[]},{"given":"Felix","family":"Binder","sequence":"additional","affiliation":[]},{"given":"Wanhee","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Sherry","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Kevin A.","family":"Smith","sequence":"additional","affiliation":[]},{"given":"Judith E.","family":"Fan","sequence":"additional","affiliation":[]},{"given":"Daniel L. K.","family":"Yamins","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"21_CR1","unstructured":"Pytorch autograd. https:\/\/pytorch.org\/tutorials\/beginner\/blitz\/autograd_tutorial.html. Accessed 3 Mar 2024"},{"key":"21_CR2","unstructured":"Agrawal, P., Nair, A.V., Abbeel, P., Malik, J., Levine, S.: Learning to poke by poking: experiential learning of intuitive physics. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Ajay, A., et al.: Combining physical simulators and object-based networks for control. In: 2019 International Conference on Robotics and Automation (ICRA), pp. 3217\u20133223. IEEE (2019)","DOI":"10.1109\/ICRA.2019.8794358"},{"key":"21_CR4","unstructured":"Babaeizadeh, M., Saffar, M.T., Nair, S., Levine, S., Finn, C., Erhan, D.: FitVid: overfitting in pixel-level video prediction. arXiv preprint arXiv:2106.13195 (2021)"},{"key":"21_CR5","unstructured":"Bakhtin, A., van\u00a0der Maaten, L., Johnson, J., Gustafson, L., Girshick, R.: PHYRE: a new benchmark for physical reasoning. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"21_CR6","unstructured":"Bardes, A., et al.: Revisiting feature prediction for learning visual representations from video. arXiv preprint (2024)"},{"issue":"7","key":"21_CR7","doi-asserted-by":"publisher","first-page":"e1007210","DOI":"10.1371\/journal.pcbi.1007210","volume":"15","author":"CJ Bates","year":"2019","unstructured":"Bates, C.J., Yildirim, I., Tenenbaum, J.B., Battaglia, P.: Modeling human intuitions about liquid flow with particle-based simulation. PLoS Comput. Biol. 15(7), e1007210 (2019)","journal-title":"PLoS Comput. Biol."},{"key":"21_CR8","unstructured":"Battaglia, P., Pascanu, R., Lai, M., Jimenez\u00a0Rezende, D., et\u00a0al.: Interaction networks for learning about objects, relations and physics. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"issue":"45","key":"21_CR9","doi-asserted-by":"publisher","first-page":"18327","DOI":"10.1073\/pnas.1306572110","volume":"110","author":"PW Battaglia","year":"2013","unstructured":"Battaglia, P.W., Hamrick, J.B., Tenenbaum, J.B.: Simulation as an engine of physical scene understanding. Proc. Natl. Acad. Sci. 110(45), 18327\u201318332 (2013)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"21_CR10","unstructured":"Bear, D.M., et al.: Unifying (machine) vision via counterfactual world modeling. arXiv preprint arXiv:2306.01828 (2023)"},{"key":"21_CR11","unstructured":"Bear, D.M., et\u00a0al.: Physion: evaluating physical prediction from vision in humans and machines. arXiv preprint arXiv:2106.08261 (2021)"},{"key":"21_CR12","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"21_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"719","DOI":"10.1007\/978-3-031-19818-2_41","volume-title":"Computer Vision \u2013 ECCV 2022","author":"H Chen","year":"2022","unstructured":"Chen, H., et al.: Unsupervised segmentation in real-world images via spelke object inference. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13689, pp. 719\u2013735. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19818-2_41"},{"key":"21_CR14","unstructured":"Chen, M., et al.: Generative pretraining from pixels. In: International Conference on Machine Learning, pp. 1691\u20131703. PMLR (2020)"},{"key":"21_CR15","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"21_CR16","unstructured":"Chen, X., et al.: SEINE: short-to-long video diffusion model for generative transition and prediction. arXiv preprint arXiv:2310.20700 (2023)"},{"key":"21_CR17","doi-asserted-by":"publisher","first-page":"103406","DOI":"10.1016\/j.cviu.2022.103406","volume":"219","author":"I Dave","year":"2022","unstructured":"Dave, I., Gupta, R., Rizve, M.N., Shah, M.: TCLR: temporal contrastive learning for video representation. Comput. Vis. Image Underst. 219, 103406 (2022)","journal-title":"Comput. Vis. Image Underst."},{"key":"21_CR18","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1422\u20131430 (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"21_CR19","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"21_CR20","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes (VOC) challenge. Int. J. Comput. Vision 88, 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vision"},{"key":"21_CR21","unstructured":"Feichtenhofer, C., Li, Y., He, K., et al.: Masked autoencoders as spatiotemporal learners. In: Advances in Neural Information Processing Systems, vol. 35, pp. 35946\u201335958 (2022)"},{"key":"21_CR22","unstructured":"Finn, C., Goodfellow, I., Levine, S.: Unsupervised learning for physical interaction through video prediction. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Finn, C., Levine, S.: Deep visual foresight for planning robot motion. In: 2017 IEEE International Conference on Robotics and Automation (ICRA), pp. 2786\u20132793. IEEE (2017)","DOI":"10.1109\/ICRA.2017.7989324"},{"key":"21_CR24","unstructured":"Fragkiadaki, K., Agrawal, P., Levine, S., Malik, J.: Learning visual predictive models of physics for playing billiards. arXiv preprint arXiv:1511.07404 (2015)"},{"issue":"11","key":"21_CR25","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1177\/0278364913491297","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: Vision meets robotics: the KITTI dataset. Int. J. Robot. Res. 32(11), 1231\u20131237 (2013)","journal-title":"Int. J. Robot. Res."},{"issue":"12","key":"21_CR26","doi-asserted-by":"publisher","first-page":"1731","DOI":"10.1177\/0956797617713053","volume":"28","author":"T Gerstenberg","year":"2017","unstructured":"Gerstenberg, T., Peterson, M.F., Goodman, N.D., Lagnado, D.A., Tenenbaum, J.B.: Eye-tracking causality. Psychol. Sci. 28(12), 1731\u20131744 (2017)","journal-title":"Psychol. Sci."},{"key":"21_CR27","unstructured":"Gidaris, S., Singh, P., Komodakis, N.: Unsupervised representation learning by predicting image rotations. arXiv preprint arXiv:1803.07728 (2018)"},{"key":"21_CR28","doi-asserted-by":"crossref","unstructured":"Goldberg, L.R.: The Book of Why: The New Science of Cause and Effect: by judea pearl and dana mackenzie. Basic Books (2018). ISBN 978-0465097609 (2019)","DOI":"10.1080\/14697688.2019.1655928"},{"key":"21_CR29","doi-asserted-by":"crossref","unstructured":"Goyal, R., et\u00a0al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5842\u20135850 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"21_CR30","doi-asserted-by":"crossref","unstructured":"Groth, O., Fuchs, F.B., Posner, I., Vedaldi, A.: ShapeStacks: learning vision-based physical intuition for generalised object stacking. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 702\u2013717 (2018)","DOI":"10.1007\/978-3-030-01246-5_43"},{"key":"21_CR31","unstructured":"Gupta, A., Tian, S., Zhang, Y., Wu, J., Mart\u00edn-Mart\u00edn, R., Fei-Fei, L.: MaskViT: masked visual pre-training for video prediction. arXiv preprint arXiv:2206.11894 (2022)"},{"key":"21_CR32","unstructured":"Hafner, D., Lillicrap, T., Ba, J., Norouzi, M.: Dream to control: learning behaviors by latent imagination. arXiv preprint arXiv:1912.01603 (2019)"},{"key":"21_CR33","unstructured":"Hafner, D., et al.: Learning latent dynamics for planning from pixels. In: International Conference on Machine Learning, pp. 2555\u20132565. PMLR (2019)"},{"key":"21_CR34","unstructured":"Han, J., Huang, W., Ma, H., Li, J., Tenenbaum, J.B., Gan, C.: Learning physical dynamics with subequivariant graph neural networks. In: Thirty-Sixth Conference on Neural Information Processing Systems (2022)"},{"key":"21_CR35","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"21_CR36","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"21_CR37","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"21_CR38","unstructured":"H\u00f6ppe, T., Mehrjou, A., Bauer, S., Nielsen, D., Dittadi, A.: Diffusion models for video prediction and infilling. arXiv preprint arXiv:2206.07696 (2022)"},{"key":"21_CR39","unstructured":"Janny, S., Baradel, F., Neverova, N., Nadri, M., Mori, G., Wolf, C.: Filtered-CoPhy: unsupervised learning of counterfactual physics in pixel space. arXiv preprint arXiv:2202.00368 (2022)"},{"key":"21_CR40","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"21_CR41","unstructured":"Li, Y., Wu, J., Tedrake, R., Tenenbaum, J.B., Torralba, A.: Learning particle dynamics for manipulating rigid bodies, deformable objects, and fluids. arXiv preprint arXiv:1810.01566 (2018)"},{"key":"21_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"21_CR43","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: zero-shot one image to 3d object. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9298\u20139309 (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"21_CR44","unstructured":"Liu, Y., et\u00a0al.: Sora: a review on background, technology, limitations, and opportunities of large vision models. arXiv preprint arXiv:2402.17177 (2024)"},{"key":"21_CR45","unstructured":"Lu, H., et al.: VDT: an empirical study on video diffusion with transformers. arXiv preprint arXiv:2305.13311 (2023)"},{"key":"21_CR46","doi-asserted-by":"crossref","unstructured":"Mehl, L., Schmalfuss, J., Jahedi, A., Nalivayko, Y., Bruhn, A.: Spring: a high-resolution high-detail dataset and benchmark for scene flow, optical flow and stereo. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00482"},{"key":"21_CR47","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., Bagherinezhad, H., Rastegari, M., Farhadi, A.: Newtonian scene understanding: unfolding the dynamics of objects in static images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3521\u20133529 (2016)","DOI":"10.1109\/CVPR.2016.383"},{"key":"21_CR48","unstructured":"Mrowca, D., et al.: Flexible neural representation for physics prediction. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"21_CR49","unstructured":"Nair, S., Rajeswaran, A., Kumar, V., Finn, C., Gupta, A.: R3M: a universal visual representation for robot manipulation. arXiv preprint arXiv:2203.12601 (2022)"},{"key":"21_CR50","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46466-4_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Noroozi","year":"2016","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving jigsaw puzzles. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 69\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_5"},{"key":"21_CR51","unstructured":"Oord, A.V.D., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"21_CR52","unstructured":"OpenAI: GPT-4 for vision (ChatGPT with image input) (2023). https:\/\/openai.com\/. Accessed 27 Oct 2023"},{"key":"21_CR53","unstructured":"Oquab, M., et al.: DINOv2: learning robust visual features without supervision (2023)"},{"key":"21_CR54","doi-asserted-by":"crossref","unstructured":"Pathak, D., Girshick, R., Doll\u00e1r, P., Darrell, T., Hariharan, B.: Learning features by watching objects move. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2701\u20132710 (2017)","DOI":"10.1109\/CVPR.2017.638"},{"issue":"9","key":"21_CR55","doi-asserted-by":"publisher","first-page":"1257","DOI":"10.1038\/s41562-022-01394-8","volume":"6","author":"LS Piloto","year":"2022","unstructured":"Piloto, L.S., Weinstein, A., Battaglia, P., Botvinick, M.: Intuitive physics learning in a deep-learning model inspired by developmental psychology. Nat. Hum. Behav. 6(9), 1257\u20131267 (2022)","journal-title":"Nat. Hum. Behav."},{"key":"21_CR56","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbelaez, P., Sorkine-Hornung, A., Gool, L.V.: The 2017 DAVIS challenge on video object segmentation. arXiv preprint arXiv:1704.00675 (2017)"},{"key":"21_CR57","unstructured":"Qi, H., Wang, X., Pathak, D., Ma, Y., Malik, J.: Learning long-term visual dynamics with region proposal interaction networks. arXiv preprint arXiv:2008.02265 (2020)"},{"key":"21_CR58","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Spatiotemporal contrastive video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6964\u20136974 (2021)","DOI":"10.1109\/CVPR46437.2021.00689"},{"issue":"9","key":"21_CR59","doi-asserted-by":"publisher","first-page":"5016","DOI":"10.1109\/TPAMI.2021.3083839","volume":"44","author":"R Riochet","year":"2021","unstructured":"Riochet, R., et al.: IntPhys 2019: a benchmark for visual intuitive physics understanding. IEEE Trans. Pattern Anal. Mach. Intell. 44(9), 5016\u20135025 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"21_CR60","unstructured":"Sanchez-Gonzalez, A., et al.: Graph networks as learnable physics engines for inference and control. In: International Conference on Machine Learning, pp. 4470\u20134479. PMLR (2018)"},{"key":"21_CR61","unstructured":"Smith, K., et al.: Modeling expectation violation in intuitive physics with coarse probabilistic object representations. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"issue":"1","key":"21_CR62","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1207\/s15516709cog1401_3","volume":"14","author":"ES Spelke","year":"1990","unstructured":"Spelke, E.S.: Principles of object perception. Cogn. Sci. 14(1), 29\u201356 (1990)","journal-title":"Cogn. Sci."},{"key":"21_CR63","unstructured":"Tacchetti, A., et al.: Relational forward models for multi-agent learning. arXiv preprint arXiv:1809.11044 (2018)"},{"key":"21_CR64","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-58536-5_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Teed","year":"2020","unstructured":"Teed, Z., Deng, J.: RAFT: recurrent all-pairs field transforms for optical flow. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part II. LNCS, vol. 12347, pp. 402\u2013419. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_24"},{"key":"21_CR65","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. In: Advances in Neural Information Processing Systems, vol. 35, pp. 10078\u201310093 (2022)"},{"key":"21_CR66","unstructured":"Voleti, V., Jolicoeur-Martineau, A., Pal, C.: MCVD-masked conditional video diffusion for prediction, generation, and interpolation. In: Advances in Neural Information Processing Systems, vol. 35, pp. 23371\u201323385 (2022)"},{"key":"21_CR67","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: VideoMAE v2: scaling video masked autoencoders with dual masking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14549\u201314560 (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"21_CR68","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2794\u20132802 (2015)","DOI":"10.1109\/ICCV.2015.320"},{"key":"21_CR69","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: FreeSOLO: learning to segment objects without annotations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14176\u201314186 (2022)","DOI":"10.1109\/CVPR52688.2022.01378"},{"key":"21_CR70","doi-asserted-by":"crossref","unstructured":"Wang, X., Girdhar, R., Yu, S.X., Misra, I.: Cut and learn for unsupervised object detection and instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3124\u20133134 (2023)","DOI":"10.1109\/CVPR52729.2023.00305"},{"key":"21_CR71","doi-asserted-by":"crossref","unstructured":"Wu, Z., Xiong, Y., Yu, S.X., Lin, D.: Unsupervised feature learning via non-parametric instance discrimination. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3733\u20133742 (2018)","DOI":"10.1109\/CVPR.2018.00393"},{"key":"21_CR72","unstructured":"Wu, Z., Dvornik, N., Greff, K., Kipf, T., Garg, A.: SlotFormer: unsupervised visual dynamics simulation with object-centric models. arXiv preprint arXiv:2210.05861 (2022)"},{"key":"21_CR73","unstructured":"Yan, W., Hafner, D., James, S., Abbeel, P.: Temporally consistent transformers for video generation (2023)"},{"key":"21_CR74","doi-asserted-by":"crossref","unstructured":"Ye, Y., Singh, M., Gupta, A., Tulsiani, S.: Compositional video prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10353\u201310362 (2019)","DOI":"10.1109\/ICCV.2019.01045"},{"key":"21_CR75","doi-asserted-by":"crossref","unstructured":"You, Y., et al.: KeypointNet: a large-scale 3D keypoint dataset aggregated from numerous human annotations. arXiv preprint arXiv:2002.12687 (2020)","DOI":"10.1109\/CVPR42600.2020.01366"},{"key":"21_CR76","doi-asserted-by":"crossref","unstructured":"Yu, L., et\u00a0al.: MAGVIT: masked generative video transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10459\u201310469 (2023)","DOI":"10.1109\/CVPR52729.2023.01008"},{"key":"21_CR77","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Colorful image colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016, Part III. LNCS, vol. 9907, pp. 649\u2013666. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_40"},{"key":"21_CR78","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Harley, A.W., Shen, B., Wetzstein, G., Guibas, L.J.: PointOdyssey: a large-scale synthetic dataset for long-term point tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19855\u201319865 (2023)","DOI":"10.1109\/ICCV51070.2023.01818"},{"key":"21_CR79","doi-asserted-by":"crossref","unstructured":"Zhuang, C., She, T., Andonian, A., Mark, M.S., Yamins, D.: Unsupervised learning from video with deep neural embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9563\u20139572 (2020)","DOI":"10.1109\/CVPR42600.2020.00958"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72691-0_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T18:09:16Z","timestamp":1730570956000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72691-0_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726903","9783031726910"],"references-count":79,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72691-0_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}