{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T17:50:15Z","timestamp":1776275415605,"version":"3.50.1"},"reference-count":238,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,3,1]],"date-time":"2023-03-01T00:00:00Z","timestamp":1677628800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62125102"],"award-info":[{"award-number":["62125102"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Proc. IEEE"],"published-print":{"date-parts":[[2023,3]]},"DOI":"10.1109\/jproc.2023.3238524","type":"journal-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T18:44:58Z","timestamp":1674845098000},"page":"257-276","source":"Crossref","is-referenced-by-count":2494,"title":["Object Detection in 20 Years: A Survey"],"prefix":"10.1109","volume":"111","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1774-552X","authenticated-orcid":false,"given":"Zhengxia","family":"Zou","sequence":"first","affiliation":[{"name":"Department of Guidance, Navigation and Control, School of Astronautics, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0483-1306","authenticated-orcid":false,"given":"Keyan","family":"Chen","sequence":"additional","affiliation":[{"name":"Image Processing Center, School of Astronautics, Beijing Key Laboratory of Digital MediaState Key Laboratory of Virtual Reality Technology and Systems, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4772-3172","authenticated-orcid":false,"given":"Zhenwei","family":"Shi","sequence":"additional","affiliation":[{"name":"Image Processing Center, School of Astronautics, Beijing Key Laboratory of Digital MediaState Key Laboratory of Virtual Reality Technology and Systems, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7621-342X","authenticated-orcid":false,"given":"Yuhong","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Computer Science, Carleton University, Ottawa, ON, Canada"}]},{"given":"Jieping","family":"Ye","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_20"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298642"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.343"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045336"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2708709"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2736553"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2001.990517"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000013087.49260.fb"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587597"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539906"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2009.167"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10578-9_23"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref21","article-title":"YOLOv3: An incremental improvement","author":"Redmon","year":"2018","journal-title":"arXiv:1804.02767"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2004.10934"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1512.02325"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2858826"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2876865"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.1999.790410"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1023\/b:visi.0000029664.99615.94"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/34.993558"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126229"},{"key":"ref33","first-page":"442","article-title":"Object detection with grammar models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Girshick"},{"key":"ref34","volume-title":"From Rigid Templates to Grammars: Object Detection With Structured Models","author":"Girshick","year":"2012"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2437384"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_5"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00442"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00615"},{"key":"ref40","article-title":"Objects as points","author":"Zhou","year":"2019","journal-title":"arXiv:1904.07850"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00511"},{"key":"ref43","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2020","journal-title":"arXiv:2010.04159"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0620-5"},{"key":"ref46","volume-title":"Discriminatively Trained Deformable Part Models, Release 5","author":"Girshick","year":"2023"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"ref49","first-page":"379","article-title":"R-FCN: Object detection via region-based fully convolutional networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dai"},{"key":"ref50","article-title":"Light-head R-CNN: In defense of two-stage object detector","author":"Li","year":"2017","journal-title":"arXiv:1711.07264"},{"key":"ref51","article-title":"YOLO9000: Better, faster, stronger","author":"Redmon","year":"2016","journal-title":"arXiv:1612.08242"},{"key":"ref52","article-title":"YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors","author":"Wang","year":"2022","journal-title":"arXiv:2207.02696"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00094"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0733-5"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01197"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref62","volume-title":"OpenImages: A Public Dataset for Large-Scale Multi-Label and Multi-Class Image Classification","author":"Krasin","year":"2017"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206631"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.155"},{"key":"ref65","article-title":"OverFeat: Integrated recognition, localization and detection using convolutional networks","author":"Sermanet","year":"2013","journal-title":"arXiv:1312.6229"},{"key":"ref66","first-page":"2553","article-title":"Deep neural networks for object detection","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Szegedy"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_22"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00644"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00975"},{"key":"ref70","volume-title":"Exemplar-Based Representations for Object Detection, Association and Beyond","author":"Malisiewicz","year":"2011"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2465908"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.5244\/C.28.24"},{"key":"ref73","first-page":"73","article-title":"What is an object","volume-title":"Proc. CVPR","author":"Alexe"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.28"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.414"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.276"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00667"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.21236\/ADA636815"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.5244\/C.30.15"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_22"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2745563"},{"key":"ref82","article-title":"Learning chained deep features and classifiers for cascade in object detection","author":"Ouyang","year":"2017","journal-title":"arXiv:1702.07054"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.135"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.444"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-011-0439-x"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.314"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12265"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_24"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2343217"},{"key":"ref91","article-title":"Exploring person context and local scene context for object detection","author":"Gupta","year":"2015","journal-title":"arXiv:1511.08177"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.440"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00378"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00730"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01462"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206532"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54193-8_14"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2642789"},{"key":"ref99","first-page":"875","article-title":"Human face detection in visual scenes","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Rowley"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.1998.710772"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_28"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.89"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref104","first-page":"1","article-title":"When does label smoothing help","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"M\u00fcller"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2967274"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1911.08287"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1049\/ip-vis:19941301"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54193-8_13"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.685"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00836"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.593"},{"key":"ref113","article-title":"Improving object localization with fitness NMS and bounded IoU loss","author":"Tychsen-Smith","year":"2017","journal-title":"arXiv:1711.00164"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00300"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00662"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-16865-4_19"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.232"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104117"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2021.3095305"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01559"},{"issue":"1","key":"ref121","doi-asserted-by":"crossref","first-page":"15","DOI":"10.1023\/A:1008162616689","article-title":"A trainable system for object detection","volume":"38","author":"Papageorgiou","year":"2000","journal-title":"Int. J. Comput. Vis."},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298686"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/jproc.2023.3238524"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.119"},{"issue":"1","key":"ref125","doi-asserted-by":"crossref","first-page":"85","DOI":"10.1023\/A:1011113216584","article-title":"Coarse-to-fine face detection","volume":"41","author":"Fleuret","year":"2001","journal-title":"Int. J. Comput. Vis."},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299170"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.384"},{"key":"ref129","first-page":"598","article-title":"Optimal brain damage","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"LeCun"},{"key":"ref130","article-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding","author":"Han","year":"2015","journal-title":"arXiv:1510.00149"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299173"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00682"},{"key":"ref133","first-page":"1967","article-title":"Pelee: A real-time object detection system on mobile devices","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2018.8621865"},{"key":"ref135","article-title":"CornerNet-lite: Efficient keypoint based object detection","author":"Law","year":"2019","journal-title":"arXiv:1904.08900"},{"key":"ref136","article-title":"PP-PicoDet: A better real-time object detector on mobile devices","author":"Yu","year":"2021","journal-title":"arXiv:2111.00902"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"ref138","article-title":"Efficient and accurate approximations of nonlinear convolutional networks","author":"Zhang","year":"2014","journal-title":"arXiv:1411.4229"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2502579"},{"key":"ref140","article-title":"ShuffleNet: An extremely efficient convolutional neural network for mobile devices","author":"Zhang","year":"2017","journal-title":"arXiv:1707.01083"},{"issue":"12","key":"ref141","first-page":"11","article-title":"CondenseNet: An efficient densenet using learned group convolutions","volume":"3","author":"Huang","year":"2017","journal-title":"Group"},{"key":"ref142","article-title":"Xception: Deep learning with depthwise separable convolutions","author":"Chollet","year":"2016","journal-title":"arXiv:1610.02357"},{"key":"ref143","article-title":"MobileNets: Efficient convolutional neural networks for mobile vision applications","author":"Howard","year":"2017","journal-title":"arXiv:1704.04861"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref145","article-title":"Tiny-DSOD: Lightweight object detection for resource-restricted usages","author":"Li","year":"2018","journal-title":"arXiv:1807.11013"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1602.07360"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.60"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.98"},{"key":"ref149","article-title":"DetNAS: Backbone search for object detection","author":"Chen","year":"2019","journal-title":"arXiv:1903.10979"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00675"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00720"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01142"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01196"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6958"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01188"},{"key":"ref156","first-page":"571","article-title":"Boxlets: A fast convolution algorithm for signal processing and neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Simard"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459207"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.188"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.5244\/C.23.91"},{"key":"ref160","article-title":"Fast training of convolutional networks through FFTs","author":"Mathieu","year":"2013","journal-title":"arXiv:1312.5851"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-71249-9_47"},{"key":"ref162","article-title":"Fast convolutional nets with FBFFT: A GPU performance evaluation","author":"Vasilache","year":"2014","journal-title":"arXiv:1412.7580"},{"key":"ref163","first-page":"2449","article-title":"Spectral representations for convolutional neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Rippel"},{"key":"ref164","first-page":"2949","article-title":"Fast template evaluation with vector quantization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sadeghi"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33885-4_5"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2015.7351502"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2018.2829147"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2014.10.002"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.315"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2016.2601622"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2867198"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00244"},{"key":"ref173","first-page":"2017","article-title":"Spatial transformer networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jaderberg"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_8"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00296"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00377"},{"key":"ref177","article-title":"SNIPER: Efficient multi-scale training","author":"Singh","year":"2018","journal-title":"arXiv:1805.09300"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00724"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.258"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.199"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.207"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00203"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.10.013"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.351"},{"key":"ref187","article-title":"Cascade R-CNN: Delving into high quality object detection","author":"Cai","year":"2017","journal-title":"arXiv:1712.00726"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC.2016.7795760"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.23919\/MVA.2017.7986913"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_48"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.92"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2017.109"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_20"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"ref195","article-title":"Unsupervised representation learning with deep convolutional generative adversarial networks","author":"Radford","year":"2015","journal-title":"arXiv:1511.06434"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.19"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.211"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_13"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.324"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3074313"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1016\/S0004-3702(96)00034-3"},{"key":"ref203","first-page":"577","article-title":"Support vector machines for multiple-instance learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Andrews"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2535231"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.99"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3046647"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2876304"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2804907"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1112-4"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.204"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.545"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.311"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2016.7477688"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00604"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00352"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00948"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00980"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00078"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00712"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01174"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01274"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00525"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093358"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107929"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01330"},{"key":"ref228","first-page":"9934","article-title":"What makes for end-to-end object detection","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sun"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2021.3061419"},{"key":"ref230","article-title":"Towards large-scale small object detection: Survey and benchmarks","author":"Cheng","year":"2022","journal-title":"arXiv:2207.14096"},{"key":"ref231","first-page":"180","article-title":"DETR3D: 3D object detection from multi-view images via 3D-to-2D queries","volume-title":"Proc. Conf. Robot Learn.","author":"Wang"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01180"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01349"},{"key":"ref234","article-title":"TransVOD: End-to-end video object detection with spatial\u2013temporal transformers","author":"Zhou","year":"2022","journal-title":"arXiv:2201.05047"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3216198"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547752"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"}],"container-title":["Proceedings of the IEEE"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5\/10061175\/10028728.pdf?arnumber=10028728","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,2]],"date-time":"2024-03-02T16:22:26Z","timestamp":1709396546000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10028728\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3]]},"references-count":238,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/jproc.2023.3238524","relation":{},"ISSN":["0018-9219","1558-2256"],"issn-type":[{"value":"0018-9219","type":"print"},{"value":"1558-2256","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,3]]}}}