{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T10:41:00Z","timestamp":1777977660243,"version":"3.51.4"},"reference-count":317,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"State Key Program of National Natural Science of China","award":["61836009"],"award-info":[{"award-number":["61836009"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1701267"],"award-info":[{"award-number":["U1701267"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Major Research Plan of the National Natural Science Foundation of China","award":["91438201"],"award-info":[{"award-number":["91438201"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2019]]},"DOI":"10.1109\/access.2019.2939201","type":"journal-article","created":{"date-parts":[[2019,9,5]],"date-time":"2019-09-05T19:57:47Z","timestamp":1567713467000},"page":"128837-128868","source":"Crossref","is-referenced-by-count":1162,"title":["A Survey of Deep Learning-Based Object Detection"],"prefix":"10.1109","volume":"7","author":[{"given":"Licheng","family":"Jiao","sequence":"first","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9715-867X","authenticated-orcid":false,"given":"Fan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"given":"Fang","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"given":"Shuyuan","family":"Yang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6130-2518","authenticated-orcid":false,"given":"Lingling","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"given":"Zhixi","family":"Feng","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"given":"Rong","family":"Qu","sequence":"additional","affiliation":[{"name":"ASAP Research Group, School of Computer Science, University of Nottingham, Nottingham, U.K."}]}],"member":"263","reference":[{"key":"ref275","first-page":"506","article-title":"Learning multiple visual domains with residual adapters","author":"rebuffi","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref274","article-title":"Universal representations: The missing link between faces, text, planktons, and cat breeds","author":"bilen","year":"2017","journal-title":"arXiv 1701 07275"},{"key":"ref277","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00352"},{"key":"ref276","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00847"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00788"},{"key":"ref270","article-title":"Accurate text localization in natural image with cascaded convolutional text network","author":"he","year":"2016","journal-title":"arXiv 1603 09423"},{"key":"ref273","article-title":"Towards universal object detection by domain attention","author":"wang","year":"2019","journal-title":"arXiv 1904 04402"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2016.03.014"},{"key":"ref272","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2818020"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.4018\/978-1-5225-5832-3.ch002"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1049\/trit.2018.1015"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.2991\/cimns-18.2018.26"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2018.2847291"},{"key":"ref176","doi-asserted-by":"crossref","first-page":"25","DOI":"10.26634\/jip.5.1.13984","article-title":"A survey of techniques for license plate detection and recognition","volume":"5","author":"nair","year":"2018","journal-title":"i-Manager&#x2019;s J Image Process"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2018.8489629"},{"key":"ref178","article-title":"ApolloCar3D: A large 3D car instance understanding benchmark for autonomous driving","author":"song","year":"2018","journal-title":"arXiv 1811 12222"},{"key":"ref177","first-page":"6389","article-title":"$L_{3}$\n-Net: Towards learning based LiDAR localization for autonomous driving","author":"lu","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref168","doi-asserted-by":"crossref","first-page":"1938","DOI":"10.1109\/LGRS.2015.2439517","article-title":"Fast multiclass vehicle detection on aerial images","volume":"12","author":"liu","year":"2015","journal-title":"IEEE Geosci Remote Sens Lett"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2015.11.002"},{"key":"ref39","article-title":"Deformable ConvNets v2: More deformable, better results","author":"zhu","year":"2018","journal-title":"arXiv 1811 11168"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.89"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref32","article-title":"YOLOv3: An incremental improvement","author":"redmon","year":"2018","journal-title":"arXiv 1804 02767"},{"key":"ref31","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"arXiv 1502 03167"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.690"},{"key":"ref267","article-title":"Improving text proposals for scene images with fully convolutional networks","author":"bazazian","year":"2017","journal-title":"arXiv 1702 05089"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00378"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.451"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00442"},{"key":"ref269","article-title":"Scene text detection via holistic, multi-channel prediction","author":"yao","year":"2016","journal-title":"arXiv 1606 09002"},{"key":"ref35","article-title":"M2Det: A single-shot object detector based on multi-level feature pyramid network","author":"zhao","year":"2018","journal-title":"arXiv 1811 04533"},{"key":"ref34","article-title":"DSSD: Deconvolutional single shot detector","author":"fu","year":"2017","journal-title":"arXiv 1701 06659"},{"key":"ref288","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_21"},{"key":"ref287","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.357"},{"key":"ref286","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.52"},{"key":"ref285","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.441"},{"key":"ref284","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.101"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2018.2843815"},{"key":"ref283","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2736553"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2019.2892405"},{"key":"ref282","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.95"},{"key":"ref281","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.330"},{"key":"ref280","article-title":"Seq-NMS for video object detection","author":"han","year":"2016","journal-title":"arXiv 1602 08465"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2018.8489623"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2018.01.005"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1049\/iet-its.2018.5223"},{"key":"ref182","article-title":"Traffic sign recognition system","author":"moritani","year":"2018"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-019-10168-2"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.04.028"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.32604\/cmc.2018.02617"},{"key":"ref186","first-page":"1","article-title":"Traffic sign recognition with light convolutional networks","author":"wu","year":"2018","journal-title":"Proc IEEE Int Conf Consum Electron -Taiwan (ICCE-TW)"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1162\/neco_a_00990"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/IVS.2018.8500699"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00716"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"ref21","article-title":"SqueezeNet: AlexNet-level accuracy with \n$50\\times$\n fewer parameters and < 0.5 MB model size","author":"iandola","year":"2016","journal-title":"arXiv 1602 07360"},{"key":"ref24","first-page":"1963","article-title":"Pelee: A real-time object detection system on mobile devices","author":"wang","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref278","article-title":"Strong-weak distribution alignment for adaptive object detection","author":"saito","year":"2018","journal-title":"arXiv 1812 04798"},{"key":"ref26","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014","journal-title":"arXiv 1409 1556"},{"key":"ref279","first-page":"4893","article-title":"Contrastive adaptation network for unsupervised domain adaptation","author":"haupmann","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit (CVPR)"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref293","first-page":"5099","article-title":"PointNet: Deep hierarchical feature learning on point sets in a metric space","author":"qi","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.557"},{"key":"ref292","first-page":"652","article-title":"PointNet: Deep learning on point sets for 3D classification and segmentation","author":"qi","year":"2017","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"ref295","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.236"},{"key":"ref294","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00472"},{"key":"ref297","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8794195"},{"key":"ref296","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.691"},{"key":"ref299","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_44"},{"key":"ref298","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2018.2864716"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2016.2645610"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2014.2374218"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2016.2569141"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2018.2848901"},{"key":"ref291","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989161"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776357"},{"key":"ref290","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2910529"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2899955"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2900302"},{"key":"ref147","article-title":"R3-Net: A deep network for multi-oriented vehicle detection in aerial images and videos","author":"li","year":"0","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2017.2694890"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.3390\/rs9040368"},{"key":"ref289","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_30"},{"key":"ref59","first-page":"2672","article-title":"Generative adversarial nets","author":"goodfellow","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2008.260"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206638"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2007.4409092"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref54","first-page":"1028","article-title":"A new benchmark for vision-based cyclist detection","author":"li","year":"2016","journal-title":"Proc IEEE Intell Vehicles Symp (IV)"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.474"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2897684"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.89"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00418"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2014.10.002"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1080\/2150704X.2019.1633486"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2930939"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2921396"},{"key":"ref162","doi-asserted-by":"crossref","first-page":"737","DOI":"10.3390\/rs11070737","article-title":"A novel multi-model decision fusion network for object detection in remote sensing images","volume":"11","author":"ma","year":"2019","journal-title":"Remote Sens"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2016.2606481"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.3390\/rs9040312"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref6","article-title":"The open images dataset v4: Unified image classification, object detection, and visual relationship detection at scale","author":"kuznetsova","year":"2018","journal-title":"arXiv 1811 00982"},{"key":"ref5","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"Vision Computer"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2014.2309695"},{"key":"ref7","article-title":"Vision meets drones: A challenge","author":"zhu","year":"2018","journal-title":"arXiv 1804 07437"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00719"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461407"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2018.2841808"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.593"},{"key":"ref45","article-title":"Beyond skip connections: Top-down modulation for object detection","author":"shrivastava","year":"2016","journal-title":"arXiv 1612 06851"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00377"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00644"},{"key":"ref42","first-page":"379","article-title":"R-FCN: Object detection via region-based fully convolutional networks","author":"dai","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.314"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.351"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.444"},{"key":"ref73","first-page":"9452","article-title":"Object detection with location-aware deformable convolution and backward attention filtering","author":"zhang","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2836461"},{"key":"ref71","first-page":"9310","article-title":"SNIPER: Efficient multi-scale training","author":"singh","year":"2018","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00730"},{"key":"ref76","article-title":"Multiple object recognition with visual attention","author":"ba","year":"2014","journal-title":"Arxiv 1412 7755"},{"key":"ref317","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-017-1053-3"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2019.2927226"},{"key":"ref316","article-title":"Fine-grained visual classification of aircraft","author":"maji","year":"2013","journal-title":"arXiv 1306 5151"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.305"},{"key":"ref315","first-page":"1","article-title":"Novel dataset for fine-grained image categorization: Stanford dogs","volume":"2","author":"khosla","year":"2011","journal-title":"Proc CVPR Workshop on Fine-Grained Visual Categorization (FGVC)"},{"key":"ref75","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"arXiv 1502 03044"},{"key":"ref314","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2834480"},{"key":"ref313","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123319"},{"key":"ref312","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.170"},{"key":"ref78","article-title":"Attentional network for visual object detection","author":"hara","year":"2017","journal-title":"arXiv 1702 01478"},{"key":"ref311","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"ref79","article-title":"An attentive survey of attention models","author":"chaudhari","year":"2019","journal-title":"arXiv 1904 02874?context=cs"},{"key":"ref310","article-title":"Rethinking on multi-stage networks for human pose estimation","author":"li","year":"2019","journal-title":"arXiv 1901 00148"},{"key":"ref60","article-title":"Learning data augmentation strategies for object detection","author":"zoph","year":"2019","journal-title":"arXiv 1906 11172"},{"key":"ref62","article-title":"FoveaBox: Beyond anchor-based object detector","author":"kong","year":"2019","journal-title":"arXiv 1904 03797"},{"key":"ref61","article-title":"FCOS: Fully convolutional one-stage object detection","author":"tian","year":"2019","journal-title":"arXiv 1904 01355"},{"key":"ref305","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2892985"},{"key":"ref63","article-title":"Libra R-CNN: Towards balanced learning for object detection","author":"pang","year":"2019","journal-title":"arXiv 1904 02701"},{"key":"ref304","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.299"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_15"},{"key":"ref307","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.395"},{"key":"ref65","article-title":"FSSD: Feature fusion single shot multibox detector","author":"li","year":"2017","journal-title":"arXiv 1712 00960"},{"key":"ref306","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00742"},{"key":"ref66","article-title":"Weaving multi-scale context for single shot detector","author":"chen","year":"2017","journal-title":"arXiv 1712 03149"},{"key":"ref301","first-page":"1736","article-title":"Articulated pose estimation by a graphical model with image dependent pairwise relations","author":"chen","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1117\/12.2503001"},{"key":"ref300","doi-asserted-by":"crossref","first-page":"483","DOI":"10.1007\/978-3-319-46484-8_29","article-title":"Stacked hourglass networks for human pose estimation","author":"newell","year":"2016","journal-title":"Computer Vision&#x2014;ECCV 2016"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018094"},{"key":"ref303","first-page":"1347","article-title":"Combining local appearance and holistic view: Dual-source deep neural networks for human pose estimation","author":"fan","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref69","article-title":"On the utility of context (or the lack thereof) for object detection","author":"barnea","year":"2017","journal-title":"arXiv 1711 05471v2"},{"key":"ref302","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.214"},{"key":"ref309","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.511"},{"key":"ref308","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_29"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2809933"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1177\/0165551517698564"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995319"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2929365"},{"key":"ref194","first-page":"5227","article-title":"Precise detection in densely packed scenes","author":"goldman","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2893953"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1145\/2393347.2396332"},{"key":"ref95","first-page":"106151e","article-title":"Feature-fused SSD: Fast detection for small objects","volume":"10615","author":"cao","year":"2018","journal-title":"Proc 9th Int Conf Graphic Image Process (ICGIP)"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00198"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-019-10836-3"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.76"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI.2018.8363547"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.685"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-018-9664-9"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-45886-1_16"},{"key":"ref90","article-title":"Adaptive NMS: Refining pedestrian detection in a crowd","author":"liu","year":"2019","journal-title":"arXiv 1904 03629"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.166"},{"key":"ref99","article-title":"MDSSD: Multi-scale deconvolutional single shot detector for small objects","author":"xu","year":"2018","journal-title":"arXiv 1805 07009"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.211"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_38"},{"key":"ref82","article-title":"Generalized intersection over union: A metric and a loss for bounding box regression","author":"rezatofighi","year":"2019","journal-title":"arXiv 1902 09630"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2967274"},{"key":"ref84","article-title":"Softer-NMS: Rethinking bounding box regression for accurate object detection","author":"he","year":"2018","journal-title":"arXiv 1809 08545"},{"key":"ref83","first-page":"2888","article-title":"Bounding box regression with uncertainty for accurate object detection","author":"he","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_11"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_48"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-019-09901-8"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-49409-8_45"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2018.8489071"},{"key":"ref88","article-title":"Towards accurate one-stage object detection with AP-loss","author":"chen","year":"2019","journal-title":"arXiv 1904 06373"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540192"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00811"},{"key":"ref100","article-title":"Face attention network: An effective face detector for the occluded faces","author":"wang","year":"2017","journal-title":"arXiv 1711 07246"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1088\/0266-5611\/27\/2\/025010"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587461"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15552-9_11"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-012-0575-y"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1145\/2070781.2024218"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298899"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.410"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1145\/1276377.1276484"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15552-9_5"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.39"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1145\/1970392.1970395"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2858795"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref215","first-page":"6837","article-title":"Stack-captioning: Coarse-to-fine learning for image captioning","author":"gu","year":"2018","journal-title":"Proc 32nd AAAI Conf Artif Intell"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00583"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.05.080"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2895793"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2919616"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.311"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1111\/2041-210X.13075"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2876304"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2898858"},{"key":"ref227","article-title":"Activity driven weakly supervised object detection","author":"yang","year":"2019","journal-title":"arXiv 1904 01665"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_2"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.545"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.326"},{"key":"ref223","doi-asserted-by":"crossref","first-page":"350","DOI":"10.1007\/978-3-319-46454-1_22","article-title":"ContextLocNet: Context-aware deep network models for weakly supervised localization","author":"kantorov","year":"2016","journal-title":"Computer Vision&#x2014;ECCV 2016"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2781233"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_24"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.379"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.776"},{"key":"ref129","article-title":"AdaCos: Adaptively scaling cosine logits for effectively learning deep face representations","author":"zhang","year":"2019","journal-title":"arXiv 1905 00292"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2842770"},{"key":"ref130","article-title":"Rethinking feature discrimination and polymerization for large-scale recognition","author":"liu","year":"2017","journal-title":"arXiv 1710 00870"},{"key":"ref133","article-title":"ArcFace: Additive angular margin loss for deep face recognition","author":"deng","year":"2018","journal-title":"arXiv 1801 07698"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2017.2739338"},{"key":"ref131","article-title":"$L_{2}$\n-constrained softmax loss for discriminative face verification","author":"ranjan","year":"2017","journal-title":"arXiv 1703 09507"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123359"},{"key":"ref232","article-title":"Cascaded partial decoder for fast and accurate salient object detection","author":"wu","year":"2019","journal-title":"arXiv 1904 08739"},{"key":"ref233","article-title":"A simple pooling-based design for real-time salient object detection","author":"liu","year":"2019","journal-title":"arXiv 1904 09569"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2843329"},{"key":"ref231","article-title":"C-MIL: Continuation multiple instance learning for weakly supervised object detection","author":"wan","year":"2019","journal-title":"arXiv 1904 05647"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.273"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2425544"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2846598"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2905607"},{"key":"ref237","first-page":"8554","article-title":"Shifting more attention to video salient object detection","author":"fan","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref236","first-page":"1623","article-title":"Attentive feedback network for boundary-aware salient object detection","author":"feng","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.384"},{"key":"ref135","article-title":"Deep face recognition: A survey","author":"wang","year":"2018","journal-title":"arXiv 1804 06655"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2300479"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.281"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.01.092"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2013.01.012"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1145\/3051126"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2015.61"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-12519-5_7"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2823766"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.155"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2016.2601622"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2016.2595324"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.242"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2460013"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2670143"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2762594"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_44"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2014.2308642"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_13"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2813165"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2859773"},{"key":"ref109","article-title":"Region proposal by guided anchoring","author":"wang","year":"2019","journal-title":"arXiv 1901 03278"},{"key":"ref108","article-title":"CenterNet: Keypoint triplets for object detection","author":"duan","year":"2019","journal-title":"arXiv 1904 08189"},{"key":"ref107","article-title":"CornerNet-Lite: Efficient keypoint based object detection","author":"law","year":"2019","journal-title":"arXiv 1904 08900"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2009.167"},{"key":"ref105","article-title":"OverFeat: Integrated recognition, localization and detection using convolutional networks","author":"sermanet","year":"2013","journal-title":"arXiv 1312 6229"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2389824"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.38"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_39"},{"key":"ref111","article-title":"Objects as points","author":"zhou","year":"2019","journal-title":"arXiv 1904 07850v1"},{"key":"ref112","article-title":"DuBox: No-prior box objection detection via residual dual scale detectors","author":"chen","year":"2019","journal-title":"arXiv 1904 06883"},{"key":"ref110","article-title":"Bottom-up object detection by grouping extreme and center points","author":"zhou","year":"2019","journal-title":"arXiv 1901 08043"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2754941"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.256"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.165"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.429"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298961"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2631900"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.112"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_51"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298994"},{"key":"ref10","first-page":"21","article-title":"SSD: Single shot multibox detector","author":"liu","year":"2016","journal-title":"Vision Computer"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.526"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref12","article-title":"A survey of the recent architectures of deep convolutional neural networks","author":"khan","year":"2019","journal-title":"arXiv 1901 06032"},{"key":"ref13","article-title":"Object detection in 20 years: A survey","author":"zou","year":"2019","journal-title":"arXiv 1905 05055v2"},{"key":"ref14","article-title":"Deep learning for generic object detection: A survey","author":"liu","year":"2018","journal-title":"arXiv 1809 02165"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref16","article-title":"DetNet: A backbone network for object detection","author":"li","year":"2018","journal-title":"arXiv 1804 06215"},{"key":"ref118","article-title":"Object detection from scratch with deep supervision","author":"shen","year":"2018","journal-title":"arXiv 1809 09294"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref117","article-title":"Tiny-DSOD: Lightweight object detection for resource-restricted usages","author":"li","year":"2018","journal-title":"arXiv 1807 11013"},{"key":"ref18","article-title":"NAS-FPN: Learning scalable feature pyramid architecture for object detection","author":"ghiasi","year":"2019","journal-title":"arXiv 1904 07392"},{"key":"ref19","article-title":"MobileNets: Efficient convolutional neural networks for mobile vision applications","author":"howard","year":"2017","journal-title":"arXiv 1704 04861"},{"key":"ref119","article-title":"Light-head R-CNN: In defense of two-stage object detector","author":"li","year":"2017","journal-title":"arXiv 1711 07264"},{"key":"ref114","first-page":"2268","article-title":"ScratchDet: Training single-shot object detectors from scratch","author":"zhu","year":"2019","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref113","article-title":"Feature selective anchor-free module for single-shot object detection","author":"zhu","year":"2019","journal-title":"arXiv 1903 00621"},{"key":"ref116","article-title":"Improving object detection from scratch via gated feature reuse","author":"shen","year":"2017","journal-title":"arXiv 1712 00886"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.212"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/CRV.2018.00023"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.54"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.56"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/IVS.2017.7995808"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.395"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_35"},{"key":"ref262","article-title":"Less is more: Learning highlight detection from video duration","author":"xiong","year":"2019","journal-title":"arXiv 1903 00859"},{"key":"ref263","article-title":"Bi-directional cascade network for perceptual edge detection","author":"he","year":"2019","journal-title":"arXiv 1902 10903"},{"key":"ref264","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2878849"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2625259"},{"key":"ref266","first-page":"4161","article-title":"TextBoxes: A fast text detector with a single deep neural network","author":"liao","year":"2017","journal-title":"Proc AAAI Conf Artif Intell"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/8600701\/08825470.pdf?arnumber=8825470","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,9]],"date-time":"2022-09-09T20:48:27Z","timestamp":1662756507000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8825470\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"references-count":317,"URL":"https:\/\/doi.org\/10.1109\/access.2019.2939201","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019]]}}}