{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T14:13:34Z","timestamp":1773843214877,"version":"3.50.1"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729393","type":"print"},{"value":"9783031729409","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T00:00:00Z","timestamp":1731801600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,17]],"date-time":"2024-11-17T00:00:00Z","timestamp":1731801600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72940-9_19","type":"book-chapter","created":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T20:42:52Z","timestamp":1731789772000},"page":"331-347","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["RoScenes: A Large-Scale Multi-view 3D Dataset for\u00a0Roadside Perception"],"prefix":"10.1007","author":[{"given":"Xiaosu","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Hualian","family":"Sheng","sequence":"additional","affiliation":[]},{"given":"Sijia","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Bing","family":"Deng","sequence":"additional","affiliation":[]},{"given":"Shaopeng","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Qiao","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Ken","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Lianli","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Jingkuan","family":"Song","sequence":"additional","affiliation":[]},{"given":"Jieping","family":"Ye","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,17]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: CVPR, pp. 11618\u201311628 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Cao, J., Pang, J., Weng, X., Khirodkar, R., Kitani, K.: Observation-centric SORT: rethinking SORT for robust multi-object tracking. In: CVPR, pp. 9686\u20139696 (2023)","DOI":"10.1109\/CVPR52729.2023.00934"},{"key":"19_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Chang, M., et al.: Argoverse: 3D tracking and forecasting with rich maps. In: CVPR, pp. 8748\u20138757 (2019)","DOI":"10.1109\/CVPR.2019.00895"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Cre\u00df, C., et al.: A9-dataset: multi-sensor infrastructure-based dataset for mobility research. In: IEEE Intelligent Vehicles Symposium, pp. 965\u2013970 (2022)","DOI":"10.1109\/IV51971.2022.9827401"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Dai, J., et al.: Deformable convolutional networks. In: ICCV, pp. 764\u2013773 (2017)","DOI":"10.1109\/ICCV.2017.89"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: CVPR, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"19_CR8","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Ettinger, S., et al.: Large scale interactive motion forecasting for autonomous driving: the Waymo open motion dataset. In: ICCV, pp. 9690\u20139699 (2021)","DOI":"10.1109\/ICCV48922.2021.00957"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? The KITTI vision benchmark suite. In: CVPR, pp. 3354\u20133361 (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"19_CR11","unstructured":"Geyer, J., et al.: A2D2: Audi autonomous driving dataset. arXiv preprint 2004.06320 (2020)"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Henkel, P., Mittmann, U., Iafrancesco, M.: Real-time kinematic positioning with GPS and GLONASS. In: European Signal Processing Conference (2016)","DOI":"10.1109\/EUSIPCO.2016.7760411"},{"key":"19_CR14","unstructured":"Huang, J., Huang, G.: BEVDet4D: exploit temporal cues in multi-camera 3D object detection. arXiv preprint 2203.17054 (2022)"},{"key":"19_CR15","unstructured":"Huang, J., Huang, G., Zhu, Z., Du, D.: BEVDet: high-performance multi-camera 3D object detection in bird-eye-view. arXiv preprint 2112.11790 (2021)"},{"issue":"10","key":"19_CR16","doi-asserted-by":"publisher","first-page":"2702","DOI":"10.1109\/TPAMI.2019.2926463","volume":"42","author":"X Huang","year":"2020","unstructured":"Huang, X., Wang, P., Cheng, X., Zhou, D., Geng, Q., Yang, R.: The apolloscape open dataset for autonomous driving and its application. IEEE Trans. Pattern Anal. Mach. Intell. 42(10), 2702\u20132719 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR17","unstructured":"Kirillov, A., et al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: BEVDepth: acquisition of reliable depth for multi-view 3D object detection. In: AAAI, pp. 1477\u20131485 (2023)","DOI":"10.1609\/aaai.v37i2.25233"},{"key":"19_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-031-20077-9_1","volume-title":"Computer Vision - ECCV 2022","author":"Z Li","year":"2022","unstructured":"Li, Z., et al.: BEVFormer: learning bird\u2019s-eye-view representation from multi-camera images via spatiotemporal transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 1\u201318. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_1"},{"key":"19_CR20","doi-asserted-by":"crossref","unstructured":"Liu, H., Teng, Y., Lu, T., Wang, H., Wang, L.: SparseBEV: high-performance sparse 3D object detection from multi-camera videos. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01703"},{"key":"19_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"531","DOI":"10.1007\/978-3-031-19812-0_31","volume-title":"Computer Vision - ECCV 2022","author":"Y Liu","year":"2022","unstructured":"Liu, Y., Wang, T., Zhang, X., Sun, J.: PETR: position embedding transformation for multi-view 3d object detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13687, pp. 531\u2013548. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19812-0_31"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: PETRv2: a unified framework for 3D perception from multi-camera images. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00302"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: ICCV, pp. 9992\u201310002 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"19_CR24","unstructured":"Lyu, C., et al.: RTMDet: an empirical study of designing real-time object detectors. arXiv preprint 2212.07784 (2022)"},{"key":"19_CR25","unstructured":"Mao, J., et al.: One million scenes for autonomous driving: ONCE dataset. In: NeurIPS (2021)"},{"key":"19_CR26","unstructured":"Park, J., et al.: Time will tell: new outlooks and A baseline for temporal multi-view 3D object detection. In: ICLR (2023)"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Patil, A., Malla, S., Gang, H., Chen, Y.: The H3D dataset for full-surround 3D multi-object detection and tracking in crowded urban scenes. In: ICRA, pp. 9552\u20139557 (2019)","DOI":"10.1109\/ICRA.2019.8793925"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Pham, Q., et al.: A*3d dataset: towards autonomous driving in challenging environments. In: ICRA, pp. 2267\u20132273 (2020)","DOI":"10.1109\/ICRA40945.2020.9197385"},{"key":"19_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"194","DOI":"10.1007\/978-3-030-58568-6_12","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Philion","year":"2020","unstructured":"Philion, J., Fidler, S.: Lift, splat, shoot: encoding images from arbitrary camera rigs by implicitly unprojecting to 3D. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12359, pp. 194\u2013210. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_12"},{"key":"19_CR30","unstructured":"Ravi, N., et al.: Accelerating 3D deep learning with pytorch3d. arXiv preprint 2007.08501 (2020)"},{"key":"19_CR31","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, pp. 5998\u20136008 (2017)"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Wang, S., Liu, Y., Wang, T., Li, Y., Zhang, X.: Exploring object-centric temporal modeling for efficient multi-view 3D object detection. In: ICCV, pp. 3621\u20133631 (2023)","DOI":"10.1109\/ICCV51070.2023.00335"},{"key":"19_CR33","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Internimage: exploring large-scale vision foundation models with deformable convolutions. In: CVPR, pp. 14408\u201314419 (2023)","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"19_CR34","unstructured":"Wang, Y., Guizilini, V., Zhang, T., Wang, Y., Zhao, H., Solomon, J.: DETR3D: 3D object detection from multi-view images via 3D-to-2D queries. In: CoRL, pp. 180\u2013191 (2021)"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Xia, G., et al.: DOTA: a large-scale dataset for object detection in aerial images. In: CVPR, pp. 3974\u20133983 (2018)","DOI":"10.1109\/CVPR.2018.00418"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Yang, C., et al.: BEVFormer v2: adapting modern image backbones to bird\u2019s-eye-view recognition via perspective supervision. In: CVPR, pp. 17830\u201317839 (2023)","DOI":"10.1109\/CVPR52729.2023.01710"},{"key":"19_CR37","doi-asserted-by":"crossref","unstructured":"Yang, L., et al.: BEVHeight: a robust framework for vision-based roadside 3D object detection. In: CVPR, pp. 21611\u201321620 (2023)","DOI":"10.1109\/CVPR52729.2023.02070"},{"key":"19_CR38","doi-asserted-by":"crossref","unstructured":"Ye, X., et al.: Rope3D: the roadside perception dataset for autonomous driving and monocular 3D object detection task. In: CVPR, pp. 21309\u201321318 (2022)","DOI":"10.1109\/CVPR52688.2022.02065"},{"key":"19_CR39","doi-asserted-by":"crossref","unstructured":"Yu, H., et al.: DAIR-V2X: a large-scale dataset for vehicle-infrastructure cooperative 3D object detection. In: CVPR, pp. 21329\u201321338 (2022)","DOI":"10.1109\/CVPR52688.2022.02067"},{"key":"19_CR40","doi-asserted-by":"crossref","unstructured":"Yu, H., et al.: V2x-Seq: a large-scale sequential dataset for vehicle-infrastructure cooperative perception and forecasting. In: CVPR, pp. 5486\u20135495 (2023)","DOI":"10.1109\/CVPR52729.2023.00531"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72940-9_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T21:34:31Z","timestamp":1731792871000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72940-9_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,17]]},"ISBN":["9783031729393","9783031729409"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72940-9_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,17]]},"assertion":[{"value":"17 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}