{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T03:58:09Z","timestamp":1771559889248,"version":"3.50.1"},"reference-count":70,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T00:00:00Z","timestamp":1749513600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T00:00:00Z","timestamp":1749513600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100020449","name":"People's Public Security University of China","doi-asserted-by":"publisher","award":["No.2023SYL06"],"award-info":[{"award-number":["No.2023SYL06"]}],"id":[{"id":"10.13039\/501100020449","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020449","name":"People's Public Security University of China","doi-asserted-by":"publisher","award":["No.2023SYL06"],"award-info":[{"award-number":["No.2023SYL06"]}],"id":[{"id":"10.13039\/501100020449","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s00371-025-03913-w","type":"journal-article","created":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T10:27:17Z","timestamp":1749551237000},"page":"9057-9075","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["ViT-SIR: vision transformer-based shoe image retrieval with enhanced feature representation"],"prefix":"10.1007","volume":"41","author":[{"given":"Jisong","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunqi","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,6,10]]},"reference":[{"key":"3913_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-021-00444-8","volume":"8","author":"L Alzubaidi","year":"2021","unstructured":"Alzubaidi, L., Zhang, J., Humaidi, A.J., Al-Dujaili, A., Duan, Y., Al-Shamma, O., Santamar\u00eda, J., Fadhel, M.A., Al-Amidie, M., Farhan, L.: Review of deep learning: concepts, CNN architectures, challenges, applications, future directions. J. Big Data 8, 1\u201374 (2021)","journal-title":"J. Big Data"},{"issue":"7","key":"3913_CR2","doi-asserted-by":"publisher","first-page":"5930","DOI":"10.3390\/su15075930","volume":"15","author":"AW Salehi","year":"2023","unstructured":"Salehi, A.W., Khan, S., Gupta, G., Alabduallah, B.I., Almjally, A., Alsolai, H., Siddiqui, T., Mellit, A.: A study of CNN and transfer learning in medical imaging: advantages, challenges, future scope. Sustainability 15(7), 5930 (2023)","journal-title":"Sustainability"},{"issue":"20","key":"3913_CR3","doi-asserted-by":"publisher","first-page":"2470","DOI":"10.3390\/electronics10202470","volume":"10","author":"D Bhatt","year":"2021","unstructured":"Bhatt, D., Patel, C., Talsania, H., Patel, J., Vaghela, R., Pandya, S., Modi, K., Ghayvat, H.: CNN variants for computer vision: history, architecture, application, challenges and future scope. Electronics 10(20), 2470 (2021)","journal-title":"Electronics"},{"key":"3913_CR4","doi-asserted-by":"publisher","DOI":"10.23954\/osj.v4i1.2141","author":"M Millar","year":"2019","unstructured":"Millar, M.: Review of current methods for re-identification in computer vision. Open Sci. J. (2019). https:\/\/doi.org\/10.23954\/osj.v4i1.2141","journal-title":"Open Sci. J."},{"key":"3913_CR5","unstructured":"Dosovitskiy, A.: An image is worth 16 $$\\times $$ 16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"3913_CR6","unstructured":"Vaswani, A.: Attention is all you need. In: Advances in Neural Information Processing Systems (2017)"},{"issue":"1","key":"3913_CR7","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1109\/TPAMI.2022.3152247","volume":"45","author":"K Han","year":"2022","unstructured":"Han, K., Wang, Y., Chen, H., Chen, X., Guo, J., Liu, Z., Tang, Y., Xiao, A., Xu, C., Xu, Y., et al.: A survey on vision transformer. IEEE Trans. Pattern Anal. Mach. Intell. 45(1), 87\u2013110 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3913_CR8","doi-asserted-by":"crossref","unstructured":"Yao, J., Chen, J., Niu, L., Sheng, B.: Scene-aware human pose generation using transformer. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 2847\u20132855 (2023)","DOI":"10.1145\/3581783.3612439"},{"issue":"1","key":"3913_CR9","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1016\/j.vrih.2022.07.006","volume":"5","author":"M Zhang","year":"2023","unstructured":"Zhang, M., Tian, X.: Transformer architecture based on mutual attention for image-anomaly detection. Virtual Real. Intell. Hardw. 5(1), 57\u201367 (2023)","journal-title":"Virtual Real. Intell. Hardw."},{"issue":"1","key":"3913_CR10","doi-asserted-by":"publisher","first-page":"2201","DOI":"10.1002\/cav.2201","volume":"35","author":"X Zhu","year":"2024","unstructured":"Zhu, X., Yao, X., Zhang, J., Zhu, M., You, L., Yang, X., Zhang, J., Zhao, H., Zeng, D.: TMSDNet: transformer with multi-scale dense network for single and multi-view 3D reconstruction. Comput. Animat. Virtual Worlds 35(1), 2201 (2024)","journal-title":"Comput. Animat. Virtual Worlds"},{"key":"3913_CR11","doi-asserted-by":"crossref","unstructured":"Mao, X., Qi, G., Chen, Y., Li, X., Duan, R., Ye, S., He, Y., Xue, H.: Towards robust vision transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12042\u201312051 (2022)","DOI":"10.1109\/CVPR52688.2022.01173"},{"issue":"10s","key":"3913_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3505244","volume":"54","author":"S Khan","year":"2022","unstructured":"Khan, S., Naseer, M., Hayat, M., Zamir, S.W., Khan, F.S., Shah, M.: Transformers in vision: a survey. ACM Comput. Surv. CSUR 54(10s), 1\u201341 (2022)","journal-title":"ACM Comput. Surv. CSUR"},{"issue":"9","key":"3913_CR13","doi-asserted-by":"publisher","first-page":"5521","DOI":"10.3390\/app13095521","volume":"13","author":"J Maur\u00edcio","year":"2023","unstructured":"Maur\u00edcio, J., Domingues, I., Bernardino, J.: Comparing vision transformers and convolutional neural networks for image classification: a literature review. Appl. Sci. 13(9), 5521 (2023)","journal-title":"Appl. Sci."},{"key":"3913_CR14","doi-asserted-by":"publisher","first-page":"100258","DOI":"10.1016\/j.array.2022.100258","volume":"16","author":"A Mumuni","year":"2022","unstructured":"Mumuni, A., Mumuni, F.: Data augmentation: a comprehensive survey of modern approaches. Array 16, 100258 (2022)","journal-title":"Array"},{"key":"3913_CR15","unstructured":"Yang, S., Xiao, W., Zhang, M., Guo, S., Zhao, J., Shen, F.: Image data augmentation for deep learning: a survey. arXiv preprint arXiv:2204.08610 (2022)"},{"key":"3913_CR16","doi-asserted-by":"publisher","first-page":"107791","DOI":"10.1016\/j.engappai.2023.107791","volume":"131","author":"H Naveed","year":"2024","unstructured":"Naveed, H., Anwar, S., Hayat, M., Javed, K., Mian, A.: Survey: image mixing and deleting for data augmentation. Eng. Appl. Artif. Intell. 131, 107791 (2024)","journal-title":"Eng. Appl. Artif. Intell."},{"issue":"3","key":"3913_CR17","doi-asserted-by":"publisher","first-page":"2270","DOI":"10.1002\/cav.2270","volume":"35","author":"J Lee","year":"2024","unstructured":"Lee, J., Kang, H.: PIPformers: patch based inpainting with vision transformers for generalize paintings. Comput. Animat. Virtual Worlds 35(3), 2270 (2024)","journal-title":"Comput. Animat. Virtual Worlds"},{"key":"3913_CR18","first-page":"37878","volume":"35","author":"R Balestriero","year":"2022","unstructured":"Balestriero, R., Bottou, L., LeCun, Y.: The effects of regularization and data augmentation are class dependent. Adv. Neural Inf. Process. Syst. 35, 37878\u201337891 (2022)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"3913_CR19","unstructured":"Steiner, A., Kolesnikov, A., Zhai, X., Wightman, R., Uszkoreit, J., Beyer, L.: How to train your vit? Data, augmentation, and regularization in vision transformers. arXiv preprint arXiv:2106.10270 (2021)"},{"key":"3913_CR20","unstructured":"Hern\u00e1ndez-Garc\u00eda, A., K\u00f6nig, P.: Data augmentation instead of explicit regularization. arXiv preprint arXiv:1806.03852 (2018)"},{"key":"3913_CR21","doi-asserted-by":"publisher","first-page":"58774","DOI":"10.1109\/ACCESS.2018.2872698","volume":"6","author":"D Liang","year":"2018","unstructured":"Liang, D., Yang, F., Zhang, T., Yang, P.: Understanding mixup training methods. IEEE Access 6, 58774\u201358783 (2018)","journal-title":"IEEE Access"},{"key":"3913_CR22","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S.J., Chun, S., Choe, J., Yoo, Y.: Cutmix: Regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6023\u20136032 (2019)","DOI":"10.1109\/ICCV.2019.00612"},{"key":"3913_CR23","doi-asserted-by":"crossref","unstructured":"Saravanakumar, S., Jayakumar, S., Balavikneshwaran, A.: A comprehensive study of image re-ranking methods in visual and semantic features. In: International Conference on Cognitive Computing and Cyber Physical Systems, pp. 559\u2013575. Springer (2023)","DOI":"10.1007\/978-981-97-2550-2_40"},{"issue":"3","key":"3913_CR24","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1007\/s00530-024-01296-x","volume":"30","author":"F Sabahi","year":"2024","unstructured":"Sabahi, F., Ahmad, M.O., Swamy, M.: RefinerHash: a new hashing-based re-ranking technique for image retrieval. Multimed. Syst. 30(3), 119 (2024)","journal-title":"Multimed. Syst."},{"key":"3913_CR25","doi-asserted-by":"crossref","unstructured":"Dange, B., Yadav, S., Kshirsagar, D.: Enhancing image retrieval and re-ranking efficiency using hybrid approach. In: 2020 International Conference on Smart Innovations in Design, Environment, Management, Planning and Computing (ICSIDEMPC), pp. 20\u201326. IEEE (2020)","DOI":"10.1109\/ICSIDEMPC49020.2020.9299579"},{"key":"3913_CR26","doi-asserted-by":"crossref","unstructured":"Yang, H., Lin, J., Yang, A., Wang, P., Zhou, C., Yang, H.: Prompt tuning for generative multimodal pretrained models. arXiv preprint arXiv:2208.02532 (2022)","DOI":"10.18653\/v1\/2023.findings-acl.27"},{"key":"3913_CR27","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Zheng, L., Cao, D., Li, S.: Re-ranking person re-identification with k-reciprocal encoding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1318\u20131327 (2017)","DOI":"10.1109\/CVPR.2017.389"},{"key":"3913_CR28","doi-asserted-by":"crossref","unstructured":"Jia, M., Tang, L., Chen, B.-C., Cardie, C., Belongie, S., Hariharan, B., Lim, S.-N.: Visual prompt tuning. In: European Conference on Computer Vision, pp. 709\u2013727. Springer (2022)","DOI":"10.1007\/978-3-031-19827-4_41"},{"issue":"3","key":"3913_CR29","doi-asserted-by":"publisher","first-page":"9775","DOI":"10.4249\/scholarpedia.9775","volume":"5","author":"M Pietik\u00e4inen","year":"2010","unstructured":"Pietik\u00e4inen, M.: Local binary patterns. Scholarpedia 5(3), 9775 (2010)","journal-title":"Scholarpedia"},{"issue":"1","key":"3913_CR30","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1016\/j.patcog.2009.06.017","volume":"43","author":"J \u017duni\u0107","year":"2010","unstructured":"\u017duni\u0107, J., Hirota, K., Rosin, P.L.: A hu moment invariant as a shape circularity measure. Pattern Recogn. 43(1), 47\u201357 (2010)","journal-title":"Pattern Recogn."},{"key":"3913_CR31","doi-asserted-by":"crossref","unstructured":"Satiyan, M., Nagarajan, R.: Recognition of facial expression using Haar-like feature extraction method. In: 2010 International Conference on Intelligent and Advanced Systems, pp. 1\u20134. IEEE (2010)","DOI":"10.1109\/ICIAS.2010.5716228"},{"key":"3913_CR32","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1007\/s11263-011-0505-4","volume":"98","author":"J Cruz-Mota","year":"2012","unstructured":"Cruz-Mota, J., Bogdanova, I., Paquier, B., Bierlaire, M., Thiran, J.-P.: Scale invariant feature transform on the sphere: theory and applications. Int. J. Comput. Vis. 98, 217\u2013241 (2012)","journal-title":"Int. J. Comput. Vis."},{"key":"3913_CR33","doi-asserted-by":"crossref","unstructured":"Dalal, N., Triggs, B.: Histograms of oriented gradients for human detection. In: 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR\u201905), vol. 1, pp. 886\u2013893. IEEE (2005)","DOI":"10.1109\/CVPR.2005.177"},{"key":"3913_CR34","doi-asserted-by":"crossref","unstructured":"Bay, H., Tuytelaars, T., Van\u00a0Gool, L.: Surf: speeded up robust features. In: Computer Vision\u2013ECCV 2006: 9th European Conference on Computer Vision, Graz, Austria, May 7-13, 2006. Proceedings, Part I 9, pp. 404\u2013417. Springer (2006)","DOI":"10.1007\/11744023_32"},{"key":"3913_CR35","unstructured":"Alom, M.Z., Taha, T.M., Yakopcic, C., Westberg, S., Sidike, P., Nasrin, M.S., Van\u00a0Esesn, B.C., Awwal, A.A.S., Asari, V.K.: The history began from alexnet: a comprehensive survey on deep learning approaches. arXiv preprint arXiv:1803.01164 (2018)"},{"key":"3913_CR36","unstructured":"Targ, S., Almeida, D., Lyman, K.: Resnet in resnet: generalizing residual architectures. arXiv preprint arXiv:1603.08029 (2016)"},{"key":"3913_CR37","doi-asserted-by":"publisher","first-page":"107660","DOI":"10.1016\/j.cmpb.2023.107660","volume":"240","author":"W Xu","year":"2023","unstructured":"Xu, W., Fu, Y.-L., Zhu, D.: Resnet and its application to medical image processing: research progress and challenges. Comput. Methods Programs Biomed. 240, 107660 (2023)","journal-title":"Comput. Methods Programs Biomed."},{"key":"3913_CR38","doi-asserted-by":"crossref","unstructured":"Verdhan, V., Verdhan, V.: VGGNet and AlexNet networks. In: Computer Vision Using Deep Learning: Neural Network Architectures with Python and Keras, pp. 103\u2013139 (2021)","DOI":"10.1007\/978-1-4842-6616-8_4"},{"issue":"1","key":"3913_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2632165","volume":"11","author":"J Huang","year":"2014","unstructured":"Huang, J., Liu, S., Xing, J., Mei, T., Yan, S.: Circle & search: attribute-aware shoe retrieval. ACM Trans. Multimed. Comput. Commun. Appl. TOMM 11(1), 1\u201321 (2014)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl. TOMM"},{"key":"3913_CR40","first-page":"39","volume":"4","author":"L Jiang","year":"2016","unstructured":"Jiang, L., Zhao, H., Wu, C.: Shoe image retrieval based on convolutional neural networks. Mod. Comput. 4, 39\u201343 (2016)","journal-title":"Mod. Comput."},{"issue":"12","key":"3913_CR41","doi-asserted-by":"publisher","first-page":"5867","DOI":"10.1109\/TIP.2017.2736346","volume":"26","author":"H Zhan","year":"2017","unstructured":"Zhan, H., Shi, B., Kot, A.C.: Cross-domain shoe retrieval with a semantic hierarchy of attribute classification network. IEEE Trans. Image Process. 26(12), 5867\u20135881 (2017)","journal-title":"IEEE Trans. Image Process."},{"key":"3913_CR42","doi-asserted-by":"crossref","unstructured":"Yu, Q., Liu, F., Song, Y.-Z., Xiang, T., Hospedales, T.M., Loy, C.-C.: Sketch me that shoe. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 799\u2013807 (2016)","DOI":"10.1109\/CVPR.2016.93"},{"issue":"2","key":"3913_CR43","first-page":"669","volume":"20","author":"H Jiang","year":"2020","unstructured":"Jiang, H., Yang, M., Mi, Z., Tang, Y.: Automatic shoe classification method under low resolution conditions. Sci. Technol. Eng. 20(2), 669\u2013674 (2020)","journal-title":"Sci. Technol. Eng."},{"issue":"19","key":"3913_CR44","doi-asserted-by":"publisher","first-page":"191505","DOI":"10.3788\/LOP56.191505","volume":"56","author":"M Yang","year":"2019","unstructured":"Yang, M., Tang, Y., Jiang, X., et al.: Shoe type recognition method based on convolutional neural networks. Laser Optoelectron. Prog. 56(19), 191505 (2019)","journal-title":"Laser Optoelectron. Prog."},{"issue":"2","key":"3913_CR45","doi-asserted-by":"crossref","first-page":"0215004","DOI":"10.3788\/LOP202259.0215004","volume":"59","author":"J Zhang","year":"2022","unstructured":"Zhang, J., Tang, Y., Yang, Z., Geng, P., et al.: Shoe type recognition algorithm based on attention mechanism. Laser Optoelectron. Prog. 59(2), 0215004\u20130215004 (2022)","journal-title":"Laser Optoelectron. Prog."},{"issue":"19","key":"3913_CR46","first-page":"139","volume":"44","author":"J Zhang","year":"2021","unstructured":"Zhang, J., Tang, Y., Yang, Z.: Shoe shape recognition algorithm based on improved residual network and data augmentation. Electron. Meas. Technol. 44(19), 139\u2013147 (2021)","journal-title":"Electron. Meas. Technol."},{"key":"3913_CR47","first-page":"295","volume":"48","author":"J Zhang","year":"2022","unstructured":"Zhang, J., Tang, Y., Yang, Z.: Shoe type recognition algorithm integrating adaptive receptive field and multi-branch features. Comput. Eng. 48, 295\u2013303 (2022)","journal-title":"Comput. Eng."},{"key":"3913_CR48","doi-asserted-by":"publisher","first-page":"35479","DOI":"10.1109\/ACCESS.2023.3266093","volume":"11","author":"AB Amjoud","year":"2023","unstructured":"Amjoud, A.B., Amrouch, M.: Object detection using deep learning, CNNs and vision transformers: a review. IEEE Access 11, 35479\u201335516 (2023)","journal-title":"IEEE Access"},{"key":"3913_CR49","unstructured":"Islam, K.: Recent advances in vision transformer: a survey and outlook of recent work. arXiv preprint arXiv:2203.01536 (2022)"},{"key":"3913_CR50","unstructured":"Chen, W., Liu, Y., Wang, W., Bakker, E., Georgiou, T., Fieguth, P., Liu, L., Lew, M.S.: Deep image retrieval: a survey. arXiv preprint arXiv:2101.11282 (2021)"},{"key":"3913_CR51","doi-asserted-by":"crossref","unstructured":"Gkelios, S., Boutalis, Y., Chatzichristofis, S.A.: Investigating the vision transformer model for image retrieval tasks. In: 2021 17th International Conference on Distributed Computing in Sensor Systems (DCOSS), pp. 367\u2013373. IEEE (2021)","DOI":"10.1109\/DCOSS52077.2021.00065"},{"key":"3913_CR52","unstructured":"El-Nouby, A., Neverova, N., Laptev, I., J\u00e9gou, H.: Training vision transformers for image retrieval. arXiv preprint arXiv:2102.05644 (2021)"},{"key":"3913_CR53","doi-asserted-by":"crossref","unstructured":"Song, C.H., Yoon, J., Choi, S., Avrithis, Y.: Boosting vision transformers for image retrieval. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 107\u2013117 (2023)","DOI":"10.1109\/WACV56688.2023.00019"},{"key":"3913_CR54","doi-asserted-by":"crossref","unstructured":"Tan, F., Yuan, J., Ordonez, V.: Instance-level image retrieval using reranking transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12105\u201312115 (2021)","DOI":"10.1109\/ICCV48922.2021.01189"},{"key":"3913_CR55","doi-asserted-by":"crossref","unstructured":"He, S., Luo, H., Wang, P., Wang, F., Li, H., Jiang, W.: Transreid: transformer-based object re-identification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15013\u201315022 (2021)","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"3913_CR56","doi-asserted-by":"crossref","unstructured":"Li, X., Yu, J., Jiang, S., Lu, H., Li, Z.: Msvit: training multiscale vision transformers for image retrieval. IEEE Trans. Multimed. 26, 2809\u20132823 (2023)","DOI":"10.1109\/TMM.2023.3304021"},{"key":"3913_CR57","doi-asserted-by":"crossref","unstructured":"Phan, N., Huy, T.D., Duong, S.T., Hoang, N.T., Tran, S., Hung, D.H., Nguyen, C.D.T., Bui, T., Truong, S.Q.: Logovit: local-global vision transformer for object re-identification. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096126"},{"issue":"6","key":"3913_CR58","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1016\/j.vrih.2023.06.003","volume":"5","author":"M Wang","year":"2023","unstructured":"Wang, M., Meng, M., Liu, J., Wu, J.: Learning adequate alignment and interaction for cross-modal retrieval. Virtual Real. Intell. Hardw. 5(6), 509\u2013522 (2023)","journal-title":"Virtual Real. Intell. Hardw."},{"key":"3913_CR59","doi-asserted-by":"crossref","unstructured":"Ding, M., Xiao, B., Codella, N., Luo, P., Wang, J., Yuan, L.: Davit: dual attention vision transformers. In: European Conference on Computer Vision, pp. 74\u201392. Springer (2022)","DOI":"10.1007\/978-3-031-20053-3_5"},{"key":"3913_CR60","doi-asserted-by":"crossref","unstructured":"Yuan, L., Chen, Y., Wang, T., Yu, W., Shi, Y., Jiang, Z.-H., Tay, F.E., Feng, J., Yan, S.: Tokens-to-token vit: training vision transformers from scratch on imagenet. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 558\u2013567 (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"3913_CR61","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"3913_CR62","doi-asserted-by":"crossref","unstructured":"Wang, Q., Wu, B., Zhu, P., Li, P., Zuo, W., Hu, Q.: ECA-Net: efficient channel attention for deep convolutional neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11534\u201311542 (2020)","DOI":"10.1109\/CVPR42600.2020.01155"},{"key":"3913_CR63","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, J., Lee, J.-Y., Kweon, I.S.: Cbam: convolutional block attention module. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 3\u201319 (2018)","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"3913_CR64","unstructured":"Park, J.: Bam: Bottleneck attention module. arXiv preprint arXiv:1807.06514 (2018)"},{"key":"3913_CR65","doi-asserted-by":"crossref","unstructured":"Bastidas, A.A., Tang, H.: Channel attention networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (2019)","DOI":"10.1109\/CVPRW.2019.00117"},{"key":"3913_CR66","doi-asserted-by":"publisher","DOI":"10.21203\/rs.3.rs-4661763\/v1","author":"Z Zeng","year":"2024","unstructured":"Zeng, Z., Li, L., Zhao, Z., Liu, Q.: Improved fine-grained image classification in few-shot learning based on channel-spatial attention and grouped bilinear convolution. Vis. Comput. (2024). https:\/\/doi.org\/10.21203\/rs.3.rs-4661763\/v1","journal-title":"Vis. Comput."},{"key":"3913_CR67","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-024-03721-8","author":"Y Li","year":"2024","unstructured":"Li, Y., Xie, B., Li, Y., Zhang, J.: Multi-scale local regional attention fusion using visual transformers for fine-grained image classification. Vis. Comput. (2024). https:\/\/doi.org\/10.1007\/s00371-024-03721-8","journal-title":"Vis. Comput."},{"key":"3913_CR68","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TMM.2021.3120873","volume":"25","author":"X Lin","year":"2021","unstructured":"Lin, X., Sun, S., Huang, W., Sheng, B., Li, P., Feng, D.D.: EAPT: efficient attention pyramid transformer for image processing. IEEE Trans. Multimed. 25, 50\u201361 (2021)","journal-title":"IEEE Trans. Multimed."},{"key":"3913_CR69","doi-asserted-by":"crossref","unstructured":"Li, S., Sun, L., Li, Q.: Clip-reid: exploiting vision-language model for image re-identification without concrete text labels. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. 1405\u20131413 (2023)","DOI":"10.1609\/aaai.v37i1.25225"},{"key":"3913_CR70","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Zhang, T., Chen, D., Wang, Y., Chen, Q., Xie, X., Sun, H., Deng, W., Zhang, Q., Yang, F., et al.: Irgen: generative modeling for image retrieval. In: European Conference on Computer Vision, pp. 21\u201341. Springer (2024)","DOI":"10.1007\/978-3-031-72633-0_2"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03913-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-025-03913-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03913-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T19:08:44Z","timestamp":1757185724000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-025-03913-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,10]]},"references-count":70,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["3913"],"URL":"https:\/\/doi.org\/10.1007\/s00371-025-03913-w","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6,10]]},"assertion":[{"value":"3 April 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 June 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This article does not contain any studies with human participants or animals performed by any of the authors.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}