{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T17:24:27Z","timestamp":1777569867206,"version":"3.51.4"},"publisher-location":"Cham","reference-count":67,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732225","type":"print"},{"value":"9783031732232","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:00:00Z","timestamp":1731024000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:00:00Z","timestamp":1731024000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73223-2_19","type":"book-chapter","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T18:49:30Z","timestamp":1731005370000},"page":"326-342","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["DeTra: A Unified Model for\u00a0Object Detection and\u00a0Trajectory Forecasting"],"prefix":"10.1007","author":[{"given":"Sergio","family":"Casas","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ben","family":"Agro","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiageng","family":"Mao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas","family":"Gilles","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexander","family":"Cui","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Raquel","family":"Urtasun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,8]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Agro, B., Sykora, Q., Casas, S., Urtasun, R.: Implicit occupancy flow fields for perception and prediction in self-driving. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00139"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Cai, Z., Vasconcelos, N.: Cascade r-cnn: high quality object detection and instance segmentation. PAMI (2019)","DOI":"10.1109\/CVPR.2018.00644"},{"key":"19_CR3","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Casas, S., Gulino, C., Liao, R., Urtasun, R.: Spagnn: spatially-aware graph neural networks for relational behavior forecasting from sensor data. In: ICRA (2020)","DOI":"10.1109\/ICRA40945.2020.9196697"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Casas, S., Gulino, C., Suo, S., Luo, K., Liao, R., Urtasun, R.: Implicit latent variable model for scene-consistent motion forecasting. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58592-1_37"},{"key":"19_CR6","unstructured":"Casas, S., Luo, W., Urtasun, R.: Intentnet: learning to predict intention from raw sensor data. In: CoRL (2018)"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Casas, S., Sadat, A., Urtasun, R.: Mp3: a unified model to map, perceive, predict and plan. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01417"},{"key":"19_CR8","unstructured":"Chai, Y., Sapp, B., Bansal, M., Anguelov, D.: Multipath: multiple probabilistic anchor trajectory hypotheses for behavior prediction. arXiv preprint arXiv:1910.05449 (2019)"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Y., Dai, X., Liu, M., Chen, D., Yuan, L., Liu, Z.: Dynamic convolution: attention over convolution kernels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11030\u201311039 (2020)","DOI":"10.1109\/CVPR42600.2020.01104"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Chitta, K., Prakash, A., Jaeger, B., Yu, Z., Renz, K., Geiger, A.: Transfuser: Imitation with transformer-based sensor fusion for autonomous driving. PAMI (2022)","DOI":"10.1109\/TPAMI.2022.3200245"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using rnn encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Cui, A., Casas, S., Sadat, A., Liao, R., Urtasun, R.: Lookout: diverse multi-future prediction and planning for self-driving. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01580"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Cui, A., Casas, S., Wong, K., Suo, S., Urtasun, R.: Gorela: go relative for viewpoint-invariant motion forecasting. arXiv preprint arXiv:2211.02545 (2022)","DOI":"10.1109\/ICRA48891.2023.10160984"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Cui, H., et al.: Multimodal trajectory predictions for autonomous driving using deep convolutional networks. 2019 ICRA, pp. 2090\u20132096 (2018). https:\/\/api.semanticscholar.org\/CorpusID:52891221","DOI":"10.1109\/ICRA.2019.8793868"},{"key":"19_CR15","unstructured":"Deo, N., Wolff, E.M., Beijbom, O.: Multimodal trajectory prediction conditioned on lane-graph traversals. In: CoRL (2021)"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Djuric, N., et al.: Multixnet: multiclass multistage multimodal motion prediction. IV (2021)","DOI":"10.1109\/IV48863.2021.9575718"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Ettinger, S., et\u00a0al.: Large scale interactive motion forecasting for autonomous driving: The waymo open motion dataset. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00957"},{"key":"19_CR18","unstructured":"Fan, H., et al.: Baidu apollo em motion planner. arXiv preprint arXiv:1807.08048 (2018)"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Gao, J., et al.: Vectornet: encoding hd maps and agent dynamics from vectorized representation. CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01154"},{"key":"19_CR20","unstructured":"Gilles, T., Sabatini, S., Tsishkou, D., Stanciulescu, B., Moutarde, F.: Thomas: trajectory heatmap output with learned multi-agent sampling. arXiv preprint arXiv:2110.06607 (2021)"},{"key":"19_CR21","unstructured":"Girgis, R., et al.: Latent variable sequential set transformers for joint multi-agent motion prediction. arXiv preprint arXiv:2104.00563 (2021)"},{"key":"19_CR22","unstructured":"Grubb, A., Bagnell, D.: Speedboost: Anytime prediction with uniform near-optimality. In: Artificial Intelligence and Statistics, pp. 458\u2013466. PMLR (2012)"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Hu, H., Dey, D., Hebert, M., Bagnell, J.A.: Learning anytime predictions in neural networks via adaptive loss balancing. In: Proceedings of the AAAI Conference on Artificial Intelligence (2019)","DOI":"10.1609\/aaai.v33i01.33013812"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Hu, Y., et al.: Planning-oriented autonomous driving. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01712"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Ivanovic, B., Lin, Y., Shrivastava, S., Chakravarty, P., Pavone, M.: Propagating state uncertainty through trajectory forecasting. In: ICRA (2022)","DOI":"10.1109\/ICRA46639.2022.9811776"},{"key":"19_CR28","unstructured":"Kipf, T.N., Welling, M.: Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)"},{"key":"19_CR29","unstructured":"Kumbhar, O., Sizikova, E., Majaj, N., Pelli, D.G.: Anytime prediction as a model of human reaction time. arXiv preprint arXiv:2011.12859 (2020)"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Lang, A.H., Vora, S., Caesar, H., Zhou, L., Yang, J., Beijbom, O.: Pointpillars: Fast encoders for object detection from point clouds. CVPR (2018)","DOI":"10.1109\/CVPR.2019.01298"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Li, L.L., et al.: End-to-end contextual perception and prediction with interaction transformer. IROS (2020)","DOI":"10.1109\/IROS45743.2020.9341392"},{"key":"19_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"541","DOI":"10.1007\/978-3-030-58536-5_32","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Liang","year":"2020","unstructured":"Liang, M., et al.: Learning lane graph representations for motion forecasting. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 541\u2013556. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_32"},{"key":"19_CR33","doi-asserted-by":"crossref","unstructured":"Liang, M., et al.: Pnpnet: end-to-end perception and prediction with tracking in the loop. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01157"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"19_CR35","unstructured":"Liu, S., et al.: DAB-DETR: Dynamic anchor boxes are better queries for DETR. In: ICLR (2022)"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Liu, W., et al.: Ssd: single shot multibox detector. In: ECCV (2015)","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"19_CR37","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"19_CR38","doi-asserted-by":"crossref","unstructured":"Luo, W., Yang, B., Urtasun, R.: Fast and furious: real time end-to-end 3d detection, tracking and motion forecasting with a single convolutional net. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00376"},{"issue":"2","key":"19_CR39","doi-asserted-by":"publisher","first-page":"5639","DOI":"10.1109\/LRA.2022.3151613","volume":"7","author":"R Mahjourian","year":"2022","unstructured":"Mahjourian, R., Kim, J., Chai, Y., Tan, M., Sapp, B., Anguelov, D.: Occupancy flow fields for motion forecasting in autonomous driving. IEEE Robot. Autom. Lett. 7(2), 5639\u20135646 (2022)","journal-title":"IEEE Robot. Autom. Lett."},{"key":"19_CR40","doi-asserted-by":"publisher","first-page":"526","DOI":"10.1109\/LRA.2020.3047793","volume":"6","author":"GP Meyer","year":"2021","unstructured":"Meyer, G.P., et al.: Laserflow: efficient and probabilistic object detection and motion forecasting. IEEE Robot. Autom. Lett. 6, 526\u2013533 (2021)","journal-title":"IEEE Robot. Autom. Lett."},{"key":"19_CR41","doi-asserted-by":"crossref","unstructured":"Mo, X., Huang, Z., Xing, Y., Lv, C.: Multi-agent trajectory prediction with heterogeneous edge-enhanced graph attention network. IEEE Trans. Intell. Transp. Syst. (2022)","DOI":"10.1109\/TITS.2022.3146300"},{"key":"19_CR42","doi-asserted-by":"crossref","unstructured":"Nayakanti, N., Al-Rfou, R., Zhou, A., Goel, K., Refaat, K.S., Sapp, B.: Wayformer: Motion forecasting via simple & efficient attention networks. arXiv preprint arXiv:2207.05844 (2022)","DOI":"10.1109\/ICRA48891.2023.10160609"},{"key":"19_CR43","unstructured":"Ngiam, J., et\u00a0al.: Scene transformer: A unified architecture for predicting multiple agent trajectories. arXiv preprint arXiv:2106.08417 (2021)"},{"key":"19_CR44","doi-asserted-by":"crossref","unstructured":"Phan-Minh, T., Grigore, E.C., Boulton, F.A., Beijbom, O., Wolff, E.M.: Covernet: Multimodal behavior prediction using trajectory sets. CVPR (2019)","DOI":"10.1109\/CVPR42600.2020.01408"},{"key":"19_CR45","unstructured":"Qi, C.R., Su, H., Mo, K., Guibas, L.J.: Pointnet: Deep learning on point sets for 3d classification and segmentation. In: CVPR (2017)"},{"key":"19_CR46","doi-asserted-by":"crossref","unstructured":"Qi, C.R., et al.: Offboard 3d object detection from point cloud sequences. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00607"},{"key":"19_CR47","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S.K., Girshick, R.B., Farhadi, A.: You only look once: Unified, real-time object detection. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2016.91"},{"key":"19_CR48","unstructured":"Ren, S., He, K., Girshick, R.B., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. PAMI (2015)"},{"key":"19_CR49","unstructured":"Rhinehart, N., McAllister, R., Levine, S.: Deep imitative models for flexible inference, planning, and control. arXiv preprint arXiv:1810.06544 (2018)"},{"key":"19_CR50","doi-asserted-by":"crossref","unstructured":"Sadat, A., Ren, M., Pokrovsky, A., Lin, Y.C., Yumer, E., Urtasun, R.: Jointly learnable behavior and trajectory planning for self-driving vehicles. In: 2019 IROS, pp. 3949\u20133956. IEEE (2019)","DOI":"10.1109\/IROS40897.2019.8967615"},{"key":"19_CR51","unstructured":"Shah, M., et al.: Liranet: end-to-end trajectory prediction using spatio-temporal radar fusion. In: CoRL (2020)"},{"key":"19_CR52","doi-asserted-by":"crossref","unstructured":"Sun, P., et\u00a0al.: Scalability in perception for autonomous driving: Waymo open dataset. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00252"},{"key":"19_CR53","doi-asserted-by":"crossref","unstructured":"Varadarajan, B., et\u00a0al.: Multipath++: Efficient information fusion and trajectory aggregation for behavior prediction. In: 2022 ICRA (2022)","DOI":"10.1109\/ICRA46639.2022.9812107"},{"key":"19_CR54","unstructured":"Vaswani, A., et al.: Attention is all you need. NeurIPS (2017)"},{"key":"19_CR55","doi-asserted-by":"crossref","unstructured":"Weng, X., Ivanovic, B., Pavone, M.: Mtp: multi-hypothesis tracking and prediction for reduced error propagation. In: IV (2022)","DOI":"10.1109\/IV51971.2022.9827273"},{"key":"19_CR56","unstructured":"Wilson, B., et\u00a0al.: Argoverse 2: next generation datasets for self-driving perception and forecasting. arXiv preprint arXiv:2301.00493 (2023)"},{"key":"19_CR57","unstructured":"Yang, A.J., et al.: Labelformer: object trajectory refinement for offboard perception from lidar point clouds. In: CoRL. PMLR (2023)"},{"key":"19_CR58","unstructured":"Yang, B., Bai, M., Liang, M., Zeng, W., Urtasun, R.: Auto4d: learning to label 4d objects from sequential point clouds. arXiv preprint arXiv:2101.06586 (2021)"},{"key":"19_CR59","doi-asserted-by":"crossref","unstructured":"Yang, B., Luo, W., Urtasun, R.: Pixor: real-time 3d object detection from point clouds. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00798"},{"key":"19_CR60","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Weng, X., Ou, Y., Kitani, K.M.: Agentformer: agent-aware transformers for socio-temporal multi-agent forecasting. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00967"},{"key":"19_CR61","doi-asserted-by":"crossref","unstructured":"Zeng, W., et al.: End-to-end interpretable neural motion planner. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00886"},{"key":"19_CR62","unstructured":"Zhou, X., Wang, D., Kr\u00e4henb\u00fchl, P.: Objects as points. ArXiv (2019)"},{"key":"19_CR63","unstructured":"Zhou, Y., et al.: End-to-end multi-view fusion for 3d object detection in lidar point clouds. ArXiv (2019)"},{"key":"19_CR64","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Tuzel, O.: Voxelnet: end-to-end learning for point cloud based 3d object detection. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2018.00472"},{"key":"19_CR65","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Wang, J., Li, Y.H., Huang, Y.K.: Query-centric trajectory prediction. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01713"},{"key":"19_CR66","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Ye, L., Wang, J., Wu, K., Lu, K.: Hivt: hierarchical vector transformer for multi-agent motion prediction. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00862"},{"key":"19_CR67","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73223-2_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:06:29Z","timestamp":1731006389000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73223-2_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,8]]},"ISBN":["9783031732225","9783031732232"],"references-count":67,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73223-2_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,8]]},"assertion":[{"value":"8 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}