{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:46:17Z","timestamp":1777653977569,"version":"3.51.4"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031727832","type":"print"},{"value":"9783031727849","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72784-9_16","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"279-295","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Benchmarking Object Detectors with\u00a0COCO: A New Path Forward"],"prefix":"10.1007","author":[{"given":"Shweta","family":"Singh","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aayan","family":"Yadav","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jitesh","family":"Jain","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Humphrey","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Justin","family":"Johnson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Karan","family":"Desai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Benenson, R., Popov, S., Ferrari, V.: Large-scale interactive object segmentation with human annotators. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.01197"},{"key":"16_CR2","unstructured":"Beyer, L., H\u00e9naff, O.J., Kolesnikov, A., Zhai, X., van\u00a0den Oord, A.: Are we done with ImageNet? arXiv preprint arXiv:2006.07159 (2020)"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Cai, Z., Vasconcelos, N.: Cascade R-CNN: delving into high quality object detection. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00644"},{"key":"16_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"16_CR5","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Semantic image segmentation with deep convolutional nets and fully connected CRFs. In: ICLR (2015)"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. In: TPAMI (2017)","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Cheng, B., et al.: Panoptic-deeplab: a simple, strong, and fast baseline for bottom-up panoptic segmentation. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.01249"},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"16_CR10","unstructured":"Cordts, M., et al.: Semantic understanding of scenes through the ade20k dataset. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)"},{"key":"16_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: Proceedings of the International Conference on Learning Representations (ICLR) (2021)"},{"key":"16_CR12","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The PASCAL Visual Object Classes Challenge 2012 (VOC2012) Results (2012). http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2012\/workshop\/index.html"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? The kitti vision benchmark suite. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: LVIS: a dataset for large vocabulary instance segmentation. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Hassani, A., Shi, H.: Dilated neighborhood attention transformer. arXiv:2209.15001 (2022)","DOI":"10.1109\/CVPR52729.2023.00599"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: Proceedings of IEEE International Conference on Computer Vision (ICCV) (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Jain, J., Li, J., Chiu, M., Hassani, A., Orlov, N., Shi, H.: OneFormer: one transformer to rule universal image segmentation. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00292"},{"key":"16_CR19","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., Doll\u00e1r, P.: Panoptic feature pyramid networks. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00656"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. In: Proceedings of IEEE International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"16_CR21","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems (NeurIPS) (2012)"},{"key":"16_CR22","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1007\/978-3-031-20077-9_17","volume-title":"ECCV 2022","author":"Y Li","year":"2022","unstructured":"Li, Y., Mao, H., Girshick, R., He, K.: Exploring plain vision transformer backbones for object detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 280\u2013296. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_17"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Mvitv2: improved multiscale vision transformers for classification and detection. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"16_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of IEEE International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.-Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"16_CR28","unstructured":"TorchVision maintainers and contributors. TorchVision: PyTorch\u2019s Computer Vision library (2016). https:\/\/github.com\/pytorch\/vision"},{"key":"16_CR29","doi-asserted-by":"crossref","unstructured":"Neuhold, G., Ollmann, T., Rota\u00a0Bulo, S., Kontschieder, P.: The mapillary vistas dataset for semantic understanding of street scenes. In: Proceedings of IEEE International Conference on Computer Vision (ICCV) (2017)","DOI":"10.1109\/ICCV.2017.534"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Radosavovic, I., Kosaraju, R.P. Girshick, R., He, K., Doll\u00e1r, P.: Designing network design spaces. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"16_CR31","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do ImageNet classifiers generalize to ImageNet? In: Proceedings of the International Conference on Machine Learning (ICML) (2019)"},{"key":"16_CR32","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems (NeurIPS) (2017)"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhu, Y., Adam, H., Yuille, A., Chen, L.C.: MaX-DeepLab: end-to-end panoptic segmentation with mask transformers. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Internimage: exploring large-scale vision foundation models with deformable convolutions. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"16_CR35","unstructured":"Wu, Y., Kirillov, A., Massa, F., Lo, W.-Y., Girshick, R.: Detectron2 (2019). https:\/\/github.com\/facebookresearch\/detectron2"},{"key":"16_CR36","unstructured":"Zimmermann, E., Szeto, J., Pasquero, J., Ratle, F.: Benchmarking a benchmark: how reliable is MS-COCO? arXiv preprint arXiv:2311.02709 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72784-9_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:50:55Z","timestamp":1727596255000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72784-9_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031727832","9783031727849"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72784-9_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}