{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,25]],"date-time":"2026-01-25T02:52:59Z","timestamp":1769309579634,"version":"3.49.0"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726514","type":"print"},{"value":"9783031726521","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T00:00:00Z","timestamp":1730246400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72652-1_11","type":"book-chapter","created":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:29:02Z","timestamp":1730190542000},"page":"177-193","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["CarFormer: Self-driving with\u00a0Learned Object-Centric Representations"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8966-2347","authenticated-orcid":false,"given":"Shadi","family":"Hamdan","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0358-983X","authenticated-orcid":false,"given":"Fatma","family":"G\u00fcney","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,30]]},"reference":[{"key":"11_CR1","unstructured":"Aydemir, G., Xie, W., G\u00fcney, F.: Self-supervised object-centric learning for videos. In: NeurIPS (2023)"},{"key":"11_CR2","doi-asserted-by":"crossref","unstructured":"Bao, Z., Tokmakov, P., Jabri, A., Wang, Y.X., Gaidon, A., Hebert, M.: Discovering objects that can move. In: CVPR, pp. 11789\u201311798 (2022)","DOI":"10.1109\/CVPR52688.2022.01149"},{"key":"11_CR3","doi-asserted-by":"crossref","unstructured":"Bao, Z., Tokmakov, P., Wang, Y.X., Gaidon, A., Hebert, M.: Object discovery from motion-guided tokens. In: CVPR, pp. 22972\u201322981 (2023)","DOI":"10.1109\/CVPR52729.2023.02200"},{"key":"11_CR4","doi-asserted-by":"crossref","unstructured":"Behl, A., Chitta, K., Prakash, A., Ohn-Bar, E., Geiger, A.: Label efficient visual abstractions for autonomous driving. In: IROS, pp. 2338\u20132345 (2020)","DOI":"10.1109\/IROS45743.2020.9340641"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: ICCV, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"11_CR6","doi-asserted-by":"crossref","unstructured":"Chang, M.F., et\u00a0al.: Argoverse: 3d tracking and forecasting with rich maps. In: CVPR, pp. 8748\u20138757 (2019)","DOI":"10.1109\/CVPR.2019.00895"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Chen, C., Seff, A., Kornhauser, A., Xiao, J.: Deepdriving: learning affordance for direct perception in autonomous driving. In: ICCV, pp. 2722\u20132730 (2015)","DOI":"10.1109\/ICCV.2015.312"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Chen, D., Kr\u00e4henb\u00fchl, P.: Learning from all vehicles. In: CVPR, 17222\u201317231 (2022)","DOI":"10.1109\/CVPR52688.2022.01671"},{"key":"11_CR9","unstructured":"Chen, D., Zhou, B., Koltun, V., Kr\u00e4henb\u00fchl, P.: Learning by cheating. In: CORL, pp. 66\u201375 (2019)"},{"key":"11_CR10","doi-asserted-by":"crossref","unstructured":"Chen, L., Wu, P., Chitta, K., Jaeger, B., Geiger, A., Li, H.: End-to-end autonomous driving: Challenges and frontiers (2024). https:\/\/arxiv.org\/abs\/2306.16927","DOI":"10.1109\/TPAMI.2024.3435937"},{"key":"11_CR11","unstructured":"Chen, L., et al.: Decision transformer: reinforcement learning via sequence modeling. In: NeurIPS, vol. 34, pp. 15084\u201315097 (2021)"},{"key":"11_CR12","doi-asserted-by":"crossref","unstructured":"Chitta, K., Prakash, A., Jaeger, B., Yu, Z., Renz, K., Geiger, A.: Transfuser: imitation with transformer-based sensor fusion for autonomous driving. IEEE TPAMI 45(11), 12878\u201312895 (2023)","DOI":"10.1109\/TPAMI.2022.3200245"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"11_CR14","unstructured":"Dosovitskiy, A., Ros, G., Codevilla, F., Lopez, A., Koltun, V.: CARLA: an open urban driving simulator. In: CORL, pp. 1\u201316 (2017)"},{"key":"11_CR15","unstructured":"Elsayed, G.F., Mahendran, A., van Steenkiste, S., Greff, K., Mozer, M.C., Kipf, T.: SAVi++: towards end-to-end object-centric learning from real-world videos. In: NeurIPS, vol. 35, pp. 28940\u201328954 (2022)"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: CVPR, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"11_CR17","doi-asserted-by":"publisher","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes (VOC) challenge. IJCV 88, 303\u2013338 (2010). https:\/\/doi.org\/10.1007\/s11263-009-0275-4","DOI":"10.1007\/s11263-009-0275-4"},{"key":"11_CR18","unstructured":"Furuta, H., Matsuo, Y., Gu, S.S.: Generalized decision transformer for offline hindsight information matching. In: ICLR (2022)"},{"key":"11_CR19","doi-asserted-by":"crossref","unstructured":"52 Hanselmann, N., Renz, K., Chitta, K., Bhattacharyya, A., Geiger, A.: King: Generating safety-critical driving scenarios for robust imitation via kinematics gradients. In: ECCV, pp. 335\u2013352 (2022)","DOI":"10.1007\/978-3-031-19839-7_20"},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Fang, Z., Li, J., Ambrus, R., Fragkiadaki, K.: Simple-bev: What really matters for multi-sensor bev perception? In: ICRA, pp. 2759\u20132765 (2023)","DOI":"10.1109\/ICRA48891.2023.10160831"},{"key":"11_CR21","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Planning-oriented autonomous driving. In: CVPR, pp. 17853\u201317862 (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"11_CR22","doi-asserted-by":"publisher","unstructured":"Janai, J., G\u00fcney, F., Behl, A., Geiger, A.: Computer vision for autonomous vehicles: problems, datasets and state of the art. Found. Trends$$\\text{\\textregistered} $$ Comput. Graph. Vis. 12(1-3), 1\u2013308 (2020). https:\/\/doi.org\/10.1561\/0600000079, http:\/\/dx.doi.org\/10.1561\/0600000079","DOI":"10.1561\/0600000079"},{"key":"11_CR23","unstructured":"Janner, M., Li, Q., Levine, S.: Offline reinforcement learning as one big sequence modeling problem. In: NeurIPS, vol. 34, pp. 1273\u20131286 (2021)"},{"key":"11_CR24","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., Van Der\u00a0Maaten, L., Fei-Fei, L., Lawrence\u00a0Zitnick, C., Girshick, R.: Clevr: a diagnostic dataset for compositional language and elementary visual reasoning. In: CVPR, pp. 2901\u20132910 (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"11_CR25","unstructured":"Karazija, L., Laina, I., Rupprecht, C.: Clevrtex: A texture-rich benchmark for unsupervised multi-object segmentation (2021). Arxiv 2111.10265"},{"key":"11_CR26","unstructured":"Kipf, T., et al.: Conditional object-centric learning from video. In: ICLR (2022)"},{"key":"11_CR27","doi-asserted-by":"crossref","unstructured":"Li, F., Kim, T., Humayun, A., Tsai, D., Rehg, J.M.: Video segmentation by tracking many figure-ground segments. In: CVPR, pp. 2192\u20132199 (2013)","DOI":"10.1109\/ICCV.2013.273"},{"key":"11_CR28","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Bevformer: Learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: ECCV, pp. 1\u201318 (2022)","DOI":"10.1007\/978-3-031-20077-9_1"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft coco: common objects in context. In: ECCV, pp. 740\u2013755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"11_CR30","unstructured":"Liu, P.J., et al.: Generating wikipedia by summarizing long sequences. In: ICLR (2018)"},{"key":"11_CR31","unstructured":"Locatello, F., et al.: Object-centric learning with slot attention. In: NeurIPS, vol. 33, pp. 11525\u201311538 (2020)"},{"key":"11_CR32","unstructured":"Micheli, V., Alonso, E., Fleuret, F.: Transformers are sample-efficient world models. In: ICLR (2023)"},{"key":"11_CR33","doi-asserted-by":"crossref","unstructured":"Mousavian, A., Fiser, M., Davidson, J., Kosecka, J., Toshev, A.: Visual representations for semantic target driven navigation. In: ICRA, pp. 8846\u20138852 (2019)","DOI":"10.1109\/ICRA.2019.8793493"},{"key":"11_CR34","unstructured":"M\u00fcller, M., Dosovitskiy, A., Ghanem, B., Koltun, V.: Driving policy transfer via modularity and abstraction. In: CORL (2018)"},{"key":"11_CR35","unstructured":"Nash, C., et al.: Transframer: Arbitrary frame prediction with generative models (2022). ARXIV 2203.09494"},{"key":"11_CR36","doi-asserted-by":"crossref","unstructured":"Ochs, P., Malik, J., Brox, T.: Segmentation of moving objects by long term video analysis. IEEE TPAMI 36(6), 1187\u20131200 (2013)","DOI":"10.1109\/TPAMI.2013.242"},{"key":"11_CR37","unstructured":"van\u00a0den Oord, A., Vinyals, O., Kavukcuoglu, K.: Neural discrete representation learning. In: NeurIPS, vol. 30 (2017)"},{"key":"11_CR38","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van\u00a0Gool, L., Gross, M., Sorkine-Hornung, A.: A benchmark dataset and evaluation methodology for video object segmentation. In: CVPR, pp. 724\u2013732 (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"11_CR39","doi-asserted-by":"crossref","unstructured":"Philion, J., Fidler, S.: Lift, splat, shoot: encoding images from arbitrary camera rigs by implicitly unprojecting to 3d. In: ECCV, pp. 194\u2013210 (2020)","DOI":"10.1007\/978-3-030-58568-6_12"},{"issue":"8","key":"11_CR40","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"11_CR41","doi-asserted-by":"crossref","unstructured":"Ren, X., Wang, X.: Look outside the room: synthesizing a consistent long-term 3d scene video from a single image. In: CVPR, 3563\u20133573 (2022)","DOI":"10.1109\/CVPR52688.2022.00355"},{"key":"11_CR42","unstructured":"Renz, K., Chitta, K., Mercea, O.B., Koepke, A.S., Akata, Z., Geiger, A.: Plant: explainable planning transformers via object-level representations. In: CORL (2022)"},{"key":"11_CR43","unstructured":"Sauer, A., Savinov, N., Geiger, A.: Conditional affordance learning for driving in urban environments. In: CORL, pp. 237\u2013252 (2018)"},{"key":"11_CR44","unstructured":"Sax, A., Emi, B., Zamir, A.R., Guibas, L.J., Savarese, S., Malik, J.: Mid-level visual representations improve generalization and sample efficiency for learning visuomotor policies. In: CORL (2019)"},{"key":"11_CR45","unstructured":"Seitzer, M., et\u00a0al.: Bridging the gap to real-world object-centric learning. In: ICLR (2023)"},{"key":"11_CR46","unstructured":"Shafiullah, N.M.M., Cui, Z.J., Altanzaya, A., Pinto, L.: Behavior transformers: cloning $$k$$ modes with one stone. In: NeurIPS, vol. 35, pp. 22955\u201322968 (2022)"},{"key":"11_CR47","doi-asserted-by":"crossref","unstructured":"Wang, D., Devin, C., Cai, Q.Z., Kr\u00e4henb\u00fchl, P., Darrell, T.: Monocular plan view networks for autonomous driving. In: IROS, pp. 2876\u20132883 (2019)","DOI":"10.1109\/IROS40897.2019.8967897"},{"key":"11_CR48","unstructured":"Wu, Z., Dvornik, N., Greff, K., Kipf, T., Garg, A.: Slotformer: unsupervised visual dynamics simulation with object-centric models. In: ICLR (2023)"},{"key":"11_CR49","doi-asserted-by":"crossref","unstructured":"Xu, N., et al.: Youtube-vos: a large-scale video object segmentation benchmark. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01228-1_36"},{"key":"11_CR50","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: Videogpt: Video generation using VQ-VAE and transformers (2021). CoRR abs\/2104.10157, https:\/\/arxiv.org\/abs\/2104.10157"},{"key":"11_CR51","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: video generation using VQ-VAE and transformers (2021). ARXIV 2104.10157"},{"key":"11_CR52","doi-asserted-by":"crossref","unstructured":"Yang, L., Fan, Y., Xu, N.: Video instance segmentation. In: CVPR, pp. 5188\u20135197 (2019)","DOI":"10.1109\/ICCV.2019.00529"},{"key":"11_CR53","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Liniger, A., Dai, D., Yu, F., Van\u00a0Gool, L.: End-to-end urban driving by imitating a reinforcement learning coach. In: ICCV, pp. 15222\u201315232 (2021)","DOI":"10.1109\/ICCV48922.2021.01494"},{"key":"11_CR54","unstructured":"Zheng, Q., Zhang, A., Grover, A.: Online decision transformer. In: ICML, pp. 27042\u201327059 (2022)"},{"key":"11_CR55","doi-asserted-by":"crossref","unstructured":"Zhou, B., Kr\u00e4henb\u00fchl, P., Koltun, V.: Does computer vision matter for action? Sci. Robot. 4(30) (2019)","DOI":"10.1126\/scirobotics.aaw6661"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72652-1_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T08:33:49Z","timestamp":1730190829000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72652-1_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,30]]},"ISBN":["9783031726514","9783031726521"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72652-1_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,30]]},"assertion":[{"value":"30 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}