{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,17]],"date-time":"2026-06-17T16:20:04Z","timestamp":1781713204111,"version":"3.54.5"},"reference-count":172,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"\u201cPioneer\u201d and \u201cLeading Goose\u201d Research and Development Program of Zhejiang, China","award":["2022C03132"],"award-info":[{"award-number":["2022C03132"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/access.2023.3273736","type":"journal-article","created":{"date-parts":[[2023,5,8]],"date-time":"2023-05-08T14:58:02Z","timestamp":1683557882000},"page":"45416-45441","source":"Crossref","is-referenced-by-count":26,"title":["Object Detection and X-Ray Security Imaging: A Survey"],"prefix":"10.1109","volume":"11","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4522-4003","authenticated-orcid":false,"given":"Jiajie","family":"Wu","sequence":"first","affiliation":[{"name":"Department of Computer Science, Hangzhou Dianzi University, Qiantang, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9832-5804","authenticated-orcid":false,"given":"Xianghua","family":"Xu","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Hangzhou Dianzi University, Qiantang, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Junyan","family":"Yang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Hangzhou Dianzi University, Qiantang, Hangzhou, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref57","article-title":"Multi-scale high-resolution vision transformer for semantic segmentation","author":"gu","year":"2021","journal-title":"arXiv 2111 01236"},{"key":"ref56","first-page":"7281","article-title":"HRFormer: High-resolution vision transformer for dense predict","volume":"34","author":"yuan","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"ref58","first-page":"2286","article-title":"ConViT: Improving vision transformers with soft convolutional inductive biases","author":"d\u2019ascoli","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00299"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref168","article-title":"Can vision transformers perform convolution?","author":"li","year":"2021","journal-title":"arXiv 2111 01353"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_20"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1407"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref170","first-page":"857","article-title":"Self-supervised learning: Generative or contrastive","volume":"35","author":"liu","year":"2023","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"ref51","article-title":"Focal self-attention for local-global interactions in vision transformers","author":"yang","year":"2021","journal-title":"arXiv 2107 00641"},{"key":"ref50","first-page":"1","article-title":"FP-DETR: Detection transformer advanced by fully pre-training","author":"wang","year":"2021","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-56769-9"},{"key":"ref172","first-page":"1","article-title":"Generative adversarial nets","volume":"27","author":"goodfellow","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00165"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00298"},{"key":"ref48","article-title":"BEiT v2: Masked image modeling with vector-quantized visual tokenizers","author":"peng","year":"2022","journal-title":"arXiv 2208 06366"},{"key":"ref47","article-title":"BEiT: BERT pre-training of image transformers","author":"bao","year":"2021","journal-title":"arXiv 2106 08254"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00359"},{"key":"ref41","first-page":"26183","article-title":"You only look at one sequence: Rethinking transformer in vision through object detection","volume":"34","author":"fang","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref44","article-title":"Efficient DETR: Improving end-to-end object detector with dense prior","author":"yao","year":"2021","journal-title":"arXiv 2104 01318"},{"key":"ref43","article-title":"DINO: DETR with improved denoising anchor boxes for end-to-end object detection","author":"zhang","year":"2022","journal-title":"arXiv 2203 03605"},{"key":"ref49","article-title":"Image as a foreign language: BEiT pretraining for all vision and vision-language tasks","author":"wang","year":"2022","journal-title":"arXiv 2208 10442"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref9","article-title":"MobileNets: Efficient convolutional neural networks for mobile vision applications","author":"howard","year":"2017","journal-title":"arXiv 1704 04861"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2001.990517"},{"key":"ref3","article-title":"An image is worth 16&#x00D7;16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2020","journal-title":"arXiv 2010 11929"},{"key":"ref6","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014","journal-title":"arXiv 1409 1556"},{"key":"ref5","article-title":"How do vision transformers work?","author":"park","year":"2022","journal-title":"arXiv 2202 06709"},{"key":"ref100","article-title":"ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness","author":"geirhos","year":"2018","journal-title":"arXiv 1811 12231"},{"key":"ref101","first-page":"1","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref40","article-title":"DAB-DETR: Dynamic anchor boxes are better queries for DETR","author":"liu","year":"2022","journal-title":"arXiv 2201 12329"},{"key":"ref35","article-title":"Sparse DETR: Efficient end-to-end object detection with learnable sparsity","author":"roh","year":"2021","journal-title":"arXiv 2111 14330"},{"key":"ref34","article-title":"End-to-end object detection with adaptive clustering transformer","author":"zheng","year":"2020","journal-title":"arXiv 2011 09315"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00360"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref31","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"Computer Vision&#x2014;ECCV 2020"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00644"},{"key":"ref30","article-title":"YOLOX: Exceeding YOLO series in 2021","author":"ge","year":"2021","journal-title":"arXiv 2107 08430"},{"key":"ref33","first-page":"1","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"zhu","year":"2021","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref146","first-page":"1","article-title":"R-FCN: Object detection via region-based fully convolutional networks","volume":"29","author":"dai","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref32","article-title":"Pix2seq: A language modeling framework for object detection","author":"chen","year":"2021","journal-title":"arXiv 2109 10852"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01325"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20158"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0620-5"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2005.239"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01283"},{"key":"ref154","article-title":"PP-YOLOE: An evolved version of YOLO","author":"xu","year":"2022","journal-title":"arXiv 2203 16250"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00511"},{"key":"ref152","year":"2022","journal-title":"YOLOv6-L (V2 1)"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.690"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref26","article-title":"YOLOv4: Optimal speed and accuracy of object detection","author":"bochkovskiy","year":"2020","journal-title":"arXiv 2004 10934"},{"key":"ref25","article-title":"YOLOv3: An incremental improvement","author":"redmon","year":"2018","journal-title":"arXiv 1804 02767"},{"key":"ref159","author":"chaucer","year":"2022","journal-title":"YOLOU United Study and Easier to Deploy"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref158","article-title":"Evolution of YOLO algorithm and YOLOv5: The state-of-the-art object detention algorithm","author":"thuan","year":"2021"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref28","article-title":"You only learn one representation: Unified network for multiple tasks","author":"wang","year":"2021","journal-title":"arXiv 2105 04206"},{"key":"ref27","article-title":"YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors","author":"wang","year":"2022","journal-title":"arXiv 2207 02696"},{"key":"ref29","article-title":"PP-YOLO: An effective and efficient implementation of object detector","author":"long","year":"2020","journal-title":"arXiv 2007 12099"},{"key":"ref166","article-title":"InternImage: Exploring large-scale vision foundation models with deformable convolutions","author":"wang","year":"2022","journal-title":"arXiv 2211 05778"},{"key":"ref167","article-title":"On the relationship between self-attention and convolutional layers","author":"cordonnier","year":"2019","journal-title":"arXiv 1911 03584"},{"key":"ref164","article-title":"EVA: Exploring the limits of masked visual representation learning at scale","author":"fang","year":"2022","journal-title":"arXiv 2211 07636"},{"key":"ref165","article-title":"ConvMAE: Masked convolution meets masked autoencoders","author":"gao","year":"2022","journal-title":"arXiv 2205 03892"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6999"},{"key":"ref163","article-title":"Mish: A self regularized non-monotonic activation function","author":"misra","year":"2019","journal-title":"arXiv 1908 08681"},{"key":"ref160","year":"2022","journal-title":"YOLOSeries"},{"key":"ref161","year":"2022","journal-title":"YOLOAir Makes Improvements Easy Again"},{"key":"ref13","first-page":"116","article-title":"ShuffleNet V2: Practical guidelines for efficient CNN architecture design","author":"ma","year":"2018","journal-title":"Proc Eur Conf Comput Vis (ECCV)"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00716"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00165"},{"key":"ref128","year":"2022","journal-title":"Pixray"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref129","year":"2022","journal-title":"CLCXray"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2022.103514"},{"key":"ref126","year":"2022","journal-title":"FSOD EDS"},{"key":"ref96","year":"2022","journal-title":"YOLOv5"},{"key":"ref127","year":"2022","journal-title":"Xray-Pi"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00203"},{"key":"ref124","article-title":"EBJR: Energy-based joint reasoning for adaptive inference","author":"akbari","year":"2021","journal-title":"arXiv 2110 10343"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref125","year":"2022","journal-title":"X-ray dataset"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01352"},{"key":"ref16","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"2019","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2389824"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2019.8851829"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9534034"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01074"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9413007"},{"key":"ref95","first-page":"1","article-title":"Trainable structure tensors for autonomous baggage threat detection under extreme occlusion","author":"hassan","year":"2020","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506608"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3174339"},{"key":"ref132","year":"2022","journal-title":"Pidray"},{"key":"ref130","year":"2022","journal-title":"HiXray"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2018.2812196"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00048"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00536"},{"key":"ref139","year":"2022","journal-title":"GDXray"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.18280\/ria.340415"},{"key":"ref137","year":"2022","journal-title":"Compass-XP"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1117\/12.2558542"},{"key":"ref138","article-title":"Toward automatic threat recognition for airport X-ray baggage screening with deep convolutional object detection","author":"liang","year":"2019","journal-title":"arXiv 1912 06329"},{"key":"ref88","first-page":"17","article-title":"Deep cmst framework for the autonomous recognition of heavily occluded and cluttered baggage items from multivendor security radiographs","volume":"14","author":"hassan","year":"2019","journal-title":"CoRR"},{"key":"ref135","year":"2022","journal-title":"OPIXray"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICSAI.2018.8599420"},{"key":"ref136","year":"2022","journal-title":"SIXray"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLA.2018.00049"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_28"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00222"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296499"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1049\/ic.2016.0080"},{"key":"ref83","first-page":"130","article-title":"Transferring X-ray based automated threat detection between scanners with different energies and resolution","volume":"10441","author":"caldwell","year":"2017","journal-title":"Proc SPIE"},{"key":"ref143","article-title":"OverFeat: Integrated recognition, localization and detection using convolutional networks","author":"sermanet","year":"2013","journal-title":"arXiv 1312 6229"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1007\/s10921-015-0315-7"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413828"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2902121"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-03398-9_36"},{"key":"ref108","article-title":"Sharpness-aware minimization for efficiently improving generalization","author":"foret","year":"2020","journal-title":"arXiv 2010 01412"},{"key":"ref78","first-page":"106","article-title":"A deep learning framework for the automated inspection of complex dual-energy X-ray cargo imagery","volume":"10187","author":"rogers","year":"2017","journal-title":"Proc SPIE"},{"key":"ref109","article-title":"When vision transformers outperform ResNets without pre-training or strong data augmentations","author":"chen","year":"2021","journal-title":"arXiv 2106 01548"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"ref107","first-page":"27378","article-title":"Understanding the robustness in vision transformers","author":"zhou","year":"2022","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_39"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01247-4"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_27"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1145\/3505244"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7532519"},{"key":"ref102","article-title":"A survey of transformers","author":"lin","year":"2021","journal-title":"arXiv 2106 04554"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108245"},{"key":"ref103","article-title":"A survey of visual transformers","author":"liu","year":"2021","journal-title":"arXiv 2111 06091"},{"key":"ref2","first-page":"1","article-title":"ImageNet classification with deep convolutional neural networks","volume":"25","author":"krizhevsky","year":"2012","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref1","article-title":"The impact of image based factors and training on threat detection performance in X-ray screening","author":"schwaninger","year":"2008"},{"key":"ref71","first-page":"3965","article-title":"CoAtNet: Marrying convolution and attention for all data sizes","volume":"34","author":"dai","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref111","article-title":"CMT: Convolutional neural networks meet vision transformers","author":"guo","year":"2021","journal-title":"arXiv 2107 06263"},{"key":"ref70","first-page":"15475","article-title":"ResT: An efficient transformer for visual recognition","volume":"34","author":"zhang","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1117\/12.957385"},{"key":"ref73","article-title":"Global context vision transformers","author":"hatamizadeh","year":"2022","journal-title":"arXiv 2206 09959"},{"key":"ref72","article-title":"Next-ViT: Next generation vision transformer for efficient deployment in realistic industrial scenarios","author":"li","year":"2022","journal-title":"arXiv 2207 05501"},{"key":"ref110","first-page":"23296","article-title":"Intriguing properties of vision transformers","volume":"34","author":"naseer","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref68","article-title":"LocalViT: Bringing locality to vision transformers","author":"li","year":"2021","journal-title":"arXiv 2104 05707"},{"key":"ref119","article-title":"The PASCAL visual object classes challenge 2007 (VOC2007) development kit","author":"everingham","year":"2007"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref69","article-title":"Conditional positional encodings for vision transformers","author":"chu","year":"2021","journal-title":"arXiv 2102 10882"},{"key":"ref118","first-page":"1","article-title":"The PASCAL visual object classes challenge 2012 (VOC2012) development kit","author":"everingham","year":"2012"},{"key":"ref64","first-page":"30392","article-title":"Early convolutions help transformers see better","volume":"34","author":"xiao","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-20747-6"},{"key":"ref63","article-title":"Visual transformers: Token-based image representation and processing for computer vision","author":"wu","year":"2020","journal-title":"arXiv 2006 03677"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.3233\/XST-190545"},{"key":"ref66","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","author":"touvron","year":"2021","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.521"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3015014"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01055"},{"key":"ref120","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"ImageNet large scale visual recognition challenge","volume":"115","author":"russakovsky","year":"2015","journal-title":"Int J Comput Vis"},{"key":"ref61","article-title":"Mobile-former: Bridging MobileNet and transformer","author":"chen","year":"2021","journal-title":"arXiv 2108 05895"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/10005208\/10120944.pdf?arnumber=10120944","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T05:59:44Z","timestamp":1769493584000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10120944\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":172,"URL":"https:\/\/doi.org\/10.1109\/access.2023.3273736","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]}}}