{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T05:09:13Z","timestamp":1748668153040,"version":"3.37.3"},"reference-count":55,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2022,8,6]],"date-time":"2022-08-06T00:00:00Z","timestamp":1659744000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,8,6]],"date-time":"2022-08-06T00:00:00Z","timestamp":1659744000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2023,2]]},"DOI":"10.1007\/s11227-022-04726-7","type":"journal-article","created":{"date-parts":[[2022,8,6]],"date-time":"2022-08-06T10:02:32Z","timestamp":1659780152000},"page":"2108-2136","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["MAN and CAT: mix attention to nn and concatenate attention to YOLO"],"prefix":"10.1007","volume":"79","author":[{"given":"Runwei","family":"Guan","sequence":"first","affiliation":[]},{"given":"Ka Lok","family":"Man","sequence":"additional","affiliation":[]},{"given":"Haocheng","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Ruixiao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Shanliang","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Jeremy","family":"Smith","sequence":"additional","affiliation":[]},{"given":"Eng 
Gee","family":"Lim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4532-0924","authenticated-orcid":false,"given":"Yutao","family":"Yue","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,8,6]]},"reference":[{"key":"4726_CR1","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"4726_CR2","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. preprint arXiv:1409.1556"},{"key":"4726_CR3","unstructured":"Tan M, Le Q (2019) Efficientnet: Rethinking model scaling for convolutional neural networks. In: International Conference on Machine Learning, PMLR, pp 6105\u20136114"},{"key":"4726_CR4","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: towards real-time object detection with region proposal networks. Advances in neural information processing systems. 28"},{"key":"4726_CR5","unstructured":"Redmon J, Farhadi A (2018) Yolov3: an incremental improvement. Preprint arXiv:1804.02767"},{"key":"4726_CR6","unstructured":"Bochkovskiy A, Wang C-Y, Liao H-YM (2020) Yolov4: optimal speed and accuracy of object detection. arXiv preprint arXiv:2004.10934"},{"key":"4726_CR7","unstructured":"Jocher G (2021) YOLOv5. https:\/\/github.com\/ultralytics\/yolov5"},{"key":"4726_CR8","unstructured":"Ge Z, Liu S, Wang F, Li Z, Sun J (2021) Yolox: exceeding yolo series in 2021. Preprint arXiv:2107.08430"},{"key":"4726_CR9","unstructured":"Shao R, Shi Z, Yi J, Chen P-Y, Hsieh C-J (2021) On the adversarial robustness of visual transformers. 
arXiv e-prints, 2103"},{"key":"4726_CR10","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, et al (2020) An image is worth 16x16 words: transformers for image recognition at scale. Preprint arXiv:2010.11929"},{"key":"4726_CR11","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Hu H, Wei Y, Zhang Z, Lin S, Guo B (2021) Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"4726_CR12","unstructured":"Chu X, Tian Z, Wang Y, Zhang B, Ren H, Wei X, Xia H, Shen,C (2021) Twins: revisiting the design of spatial attention in vision transformers. Advances in Neural Information Processing Systems, 34"},{"key":"4726_CR13","unstructured":"Touvron H, Cord M, Douze M, Massa F, Sablayrolles, A, J\u00e9gou H (2021) Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, PMLR, pp 10347\u201310357"},{"key":"4726_CR14","doi-asserted-by":"crossref","unstructured":"Yuan K, Guo S, Liu Z, Zhou A, Yu F, Wu W (2021) Incorporating convolution designs into visual transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 579\u2013588","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"4726_CR15","unstructured":"Han K, Xiao A, Wu E, Guo J, Xu C, Wang Y (2021) Transformer in transformer. Advances in Neural Information Processing Systems, 34"},{"key":"4726_CR16","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. 
In: European Conference on Computer Vision, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"4726_CR17","unstructured":"Fang Y, Liao B, Wang X, Fang J, Qi J, Wu R, Niu J, Liu W (2021) You only look at one sequence: rethinking transformer in vision through object detection. Advances in Neural Information Processing Systems, 34"},{"key":"4726_CR18","unstructured":"Zhu X, Su W, Lu L, Li B, Wang X, Dai J (2020) Deformable detr: deformable transformers for end-to-end object detection. Preprint arXiv:2010.04159"},{"key":"4726_CR19","doi-asserted-by":"crossref","unstructured":"Hu J, Shen L, Sun G (2018) Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 7132\u20137141","DOI":"10.1109\/CVPR.2018.00745"},{"key":"4726_CR20","doi-asserted-by":"crossref","unstructured":"Li X, Wang W, Hu X, Yang J (2019) Selective kernel networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 510\u2013519","DOI":"10.1109\/CVPR.2019.00060"},{"key":"4726_CR21","doi-asserted-by":"crossref","unstructured":"Woo S, Park J, Lee J-Y, Kweon IS (2018) Cbam: Convolutional block attention module. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 3\u201319","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"4726_CR22","doi-asserted-by":"crossref","unstructured":"Ramchoun H, Ghanou Y, Ettaouil M, Janati\u00a0Idrissi MA (2016) Multilayer perceptron: architecture optimization and training","DOI":"10.1145\/3090354.3090427"},{"key":"4726_CR23","doi-asserted-by":"crossref","unstructured":"Saeed F, Paul A, Rho S (2020) Faster r-cnn based fault detection in industrial images. 
In: International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems, Springer, pp 280\u2013287","DOI":"10.1007\/978-3-030-55789-8_25"},{"key":"4726_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.scs.2021.102986","volume":"71","author":"MM Rathore","year":"2021","unstructured":"Rathore MM, Paul A, Rho S, Khan M, Vimal S, Shah SA (2021) Smart traffic control: identifying driving-violations using fog devices with vehicular cameras in smart cities. Sustain Cities Soc 71:102986","journal-title":"Sustain Cities Soc"},{"issue":"28","key":"4726_CR25","doi-asserted-by":"publisher","first-page":"35789","DOI":"10.1007\/s11042-020-09087-y","volume":"80","author":"H Nawaz","year":"2021","unstructured":"Nawaz H, Maqsood M, Afzal S, Aadil F, Mehmood I, Rho S (2021) A deep feature-based real-time system for alzheimer disease stage detection. Multimed Tools Appl 80(28):35789\u201335807","journal-title":"Multimed Tools Appl"},{"key":"4726_CR26","doi-asserted-by":"crossref","unstructured":"Robinson YH, Vimal S, Julie EG, Lakshmi\u00a0Narayanan K, Rho S (2021) 3-dimensional manifold and machine learning based localization algorithm for wireless sensor networks. Wireless Personal Communications, 1\u201319","DOI":"10.1007\/s11277-021-08291-9"},{"issue":"4","key":"4726_CR27","doi-asserted-by":"publisher","first-page":"663","DOI":"10.1007\/s11760-019-01527-z","volume":"15","author":"S Fan","year":"2021","unstructured":"Fan S, Wang R, Wu Z, Rho S, Liu S, Xiong J, Fu S, Jiang F (2021) High-speed tracking based on multi-cf filters and attention mechanism. SIViP 15(4):663\u2013671","journal-title":"SIViP"},{"key":"4726_CR28","doi-asserted-by":"crossref","unstructured":"Girshick R, Donahue J, Darrell T, Malik J (2014) Rich feature hierarchies for accurate object detection and semantic segmentation. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 580\u2013587","DOI":"10.1109\/CVPR.2014.81"},{"key":"4726_CR29","doi-asserted-by":"crossref","unstructured":"Girshick R (2015) Fast r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision, pp 1440\u20131448","DOI":"10.1109\/ICCV.2015.169"},{"key":"4726_CR30","doi-asserted-by":"crossref","unstructured":"Redmon J, Divvala S, Girshick R, Farhadi A (2016) You only look once: Unified, real-time object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 779\u2013788","DOI":"10.1109\/CVPR.2016.91"},{"key":"4726_CR31","doi-asserted-by":"crossref","unstructured":"Redmon J, Farhadi A (2017) Yolo9000: better, faster, stronger. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 7263\u20137271","DOI":"10.1109\/CVPR.2017.690"},{"key":"4726_CR32","doi-asserted-by":"crossref","unstructured":"Liu S, Qi L, Qin H, Shi J, Jia J (2018) Path aggregation network for instance segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 8759\u20138768","DOI":"10.1109\/CVPR.2018.00913"},{"key":"4726_CR33","doi-asserted-by":"crossref","unstructured":"Zhang H, Cisse M, Dauphin YN, Lopez-Paz D (2017) mixup: Beyond empirical risk minimization. Preprint arXiv:1710.09412","DOI":"10.1007\/978-1-4899-7687-1_79"},{"key":"4726_CR34","doi-asserted-by":"crossref","unstructured":"Yun S, Han D, Oh SJ, Chun S, Choe J, Yoo Y (2019) Cutmix: Regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 6023\u20136032","DOI":"10.1109\/ICCV.2019.00612"},{"key":"4726_CR35","doi-asserted-by":"crossref","unstructured":"Zheng Z, Wang P, Liu W, Li J, Ye R, Ren D (2020) Distance-iou loss: Faster and better learning for bounding box regression. 
In: Proceedings of the AAAI Conference on Artificial Intelligence, vol 34, pp 12993\u201313000","DOI":"10.1609\/aaai.v34i07.6999"},{"key":"4726_CR36","unstructured":"M\u00fcller R, Kornblith S, Hinton GE (2019) When does label smoothing help? Advances in neural information processing systems, 32"},{"key":"4726_CR37","unstructured":"Tan Z, Wang J, Sun X, Lin M, Li H, et al (2021) Giraffedet: A heavy-neck paradigm for object detection. In: International Conference on Learning Representations"},{"key":"4726_CR38","doi-asserted-by":"crossref","unstructured":"Dai J, Qi H, Xiong Y, Li Y, Zhang G, Hu H, Wei Y (2017) Deformable convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp 764\u2013773","DOI":"10.1109\/ICCV.2017.89"},{"key":"4726_CR39","unstructured":"Paul S, Chen P-Y (2021) Vision transformers are robust learners. Preprint arXiv:2105.07581 2(3)"},{"key":"4726_CR40","unstructured":"Naseer MM, Ranasinghe K, Khan SH, Hayat M, Shahbaz\u00a0Khan F, Yang M-H (2021) Intriguing properties of vision transformers. Advances in Neural Information Processing Systems, 34"},{"key":"4726_CR41","doi-asserted-by":"crossref","unstructured":"Srinivas A, Lin T-Y, Parmar N, Shlens J, Abbeel P, Vaswani A (2021) Bottleneck transformers for visual recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 16519\u201316529","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"4726_CR42","doi-asserted-by":"crossref","unstructured":"Chen Z, Xie L, Niu J, Liu X, Wei L, Tian Q (2021) Visformer: The vision-friendly transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 589\u2013598","DOI":"10.1109\/ICCV48922.2021.00063"},{"key":"4726_CR43","unstructured":"Wang Q, Wu B, Zhu P, Li P, Zuo W, Hu Q Supplementary material for \u2018eca-net: Efficient channel attention for deep convolutional neural networks. 
Technical report"},{"key":"4726_CR44","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser, \u0141, Polosukhin I (2017) Attention is all you need. Advances in neural information processing systems, 30"},{"key":"4726_CR45","unstructured":"Park N, Kim S (2022) How do vision transformers work? Preprint arXiv:2202.06709"},{"key":"4726_CR46","unstructured":"Misra D (2019) Mish: A self regularized non-monotonic activation function. Preprint arXiv:1908.08681"},{"key":"4726_CR47","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Doll\u00e1r P, Girshick R, He K, Hariharan B, Belongie S (2017) Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 2117\u20132125","DOI":"10.1109\/CVPR.2017.106"},{"issue":"9","key":"4726_CR48","doi-asserted-by":"publisher","first-page":"1904","DOI":"10.1109\/TPAMI.2015.2389824","volume":"37","author":"K He","year":"2015","unstructured":"He K, Zhang X, Ren S, Sun J (2015) Spatial pyramid pooling in deep convolutional networks for visual recognition. IEEE Trans Pattern Anal Mach Intell 37(9):1904\u20131916","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"4726_CR49","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, IEEE, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"4726_CR50","unstructured":"Krizhevsky A, Hinton G, et al (2009) Learning multiple layers of features from tiny images"},{"issue":"7","key":"4726_CR51","first-page":"3","volume":"7","author":"Y Le","year":"2015","unstructured":"Le Y, Yang X (2015) Tiny imagenet visual recognition challenge. 
CS 231N 7(7):3","journal-title":"CS 231N"},{"key":"4726_CR52","unstructured":"Hassani A, Walton S, Shah N, Abuduweili A, Li J, Shi H (2021) Escaping the big data paradigm with compact transformers. Preprint arXiv:2104.05704"},{"key":"4726_CR53","doi-asserted-by":"crossref","unstructured":"Zagoruyko S, Komodakis N (2016) Wide residual networks. Preprint arXiv:1605.07146","DOI":"10.5244\/C.30.87"},{"key":"4726_CR54","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft coco: Common objects in context. In: European Conference on Computer Vision, Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"4726_CR55","unstructured":"RangiLyu (2021) NanoDet-Plus: Super fast and high accuracy lightweight anchor-free object detection model. https:\/\/github.com\/RangiLyu\/nanodet"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-022-04726-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-022-04726-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-022-04726-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,13]],"date-time":"2023-01-13T11:23:00Z","timestamp":1673608980000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-022-04726-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,6]]},"references-count":55,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2023,2]]}},"alternative-id":["4726"],"URL":"https:\/\/doi.org\/10.1007\/s11227-022-04726-7","relation":{},"
ISSN":["0920-8542","1573-0484"],"issn-type":[{"type":"print","value":"0920-8542"},{"type":"electronic","value":"1573-0484"}],"subject":[],"published":{"date-parts":[[2022,8,6]]},"assertion":[{"value":"13 July 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 August 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}