{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T07:37:16Z","timestamp":1777016236698,"version":"3.51.4"},"reference-count":49,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62473201"],"award-info":[{"award-number":["62473201"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Signal Processing: Image Communication"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.image.2026.117540","type":"journal-article","created":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T17:23:59Z","timestamp":1773854639000},"page":"117540","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Enhanced image retrieval: Leveraging multi-head attention &amp; multi-scale descriptors and hybrid aggregation feature indexing"],"prefix":"10.1016","volume":"145","author":[{"given":"Wenbin","family":"Yu","sequence":"first","affiliation":[]},{"given":"Jiaqi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yifan","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4458-5843","authenticated-orcid":false,"given":"Chengjun","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yadang","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Na","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Alex X.","family":"Liu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.image.2026.117540_b1","doi-asserted-by":"crossref","first-page":"657","DOI":"10.1109\/TRO.2020.2964138","article-title":"Good feature matching: Toward accurate, robust vo\/vslam with low latency","volume":"36","author":"Zhao","year":"2020","journal-title":"IEEE Trans. Robot."},{"key":"10.1016\/j.image.2026.117540_b2","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2020.104032","article-title":"SLAM; definition and evolution","volume":"97","author":"Taheri","year":"2021","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.image.2026.117540_b3","doi-asserted-by":"crossref","unstructured":"H. Noh, A. Araujo, J. Sim, T. Weyand, B. Han, Large-scale image retrieval with attentive deep local features, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 3456\u20133465.","DOI":"10.1109\/ICCV.2017.374"},{"issue":"9","key":"10.1016\/j.image.2026.117540_b4","doi-asserted-by":"crossref","first-page":"1704","DOI":"10.1109\/TPAMI.2011.235","article-title":"Aggregating local image descriptors into compact codes","volume":"34","author":"J\u00e9gou","year":"2011","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.image.2026.117540_b5","doi-asserted-by":"crossref","unstructured":"R. Arandjelovic, P. Gronat, A. Torii, T. Pajdla, J. Sivic, NetVLAD: CNN architecture for weakly supervised place recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 5297\u20135307.","DOI":"10.1109\/CVPR.2016.572"},{"key":"10.1016\/j.image.2026.117540_b6","doi-asserted-by":"crossref","unstructured":"S. Hausler, S. Garg, M. Xu, M. Milford, T. Fischer, Patch-netvlad: Multi-scale fusion of locally-global descriptors for place recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 14141\u201314152.","DOI":"10.1109\/CVPR46437.2021.01392"},{"key":"10.1016\/j.image.2026.117540_b7","doi-asserted-by":"crossref","first-page":"1855","DOI":"10.1109\/LSP.2024.3425279","article-title":"MS-NetVLAD: Multi-scale NetVLAD for visual place recognition","volume":"31","author":"Uggi","year":"2024","journal-title":"IEEE Signal Process. Lett."},{"issue":"2","key":"10.1016\/j.image.2026.117540_b8","doi-asserted-by":"crossref","first-page":"3882","DOI":"10.1109\/LRA.2022.3147257","article-title":"MultiRes-NetVLAD: Augmenting place recognition training with low-resolution imagery","volume":"7","author":"Khaliq","year":"2022","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.image.2026.117540_b9","first-page":"5789","article-title":"SuperVLAD: Compact and robust image descriptors for visual place recognition","volume":"vol. 37","author":"Lu","year":"2024"},{"key":"10.1016\/j.image.2026.117540_b10","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107380","article-title":"Recurrent bag-of-features for visual information analysis","volume":"106","author":"Krestenitis","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.image.2026.117540_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2021.102656","article-title":"Ensemble-based bag of features for automated classification of normal and COVID-19 CXR images","volume":"68","author":"Ashour","year":"2021","journal-title":"Biomed. Signal Process. Control."},{"key":"10.1016\/j.image.2026.117540_b12","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","article-title":"Distinctive image features from scale-invariant keypoints","volume":"60","author":"Lowe","year":"2004","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.image.2026.117540_b13","doi-asserted-by":"crossref","first-page":"404","DOI":"10.1007\/11744023_32","article-title":"Surf: Speeded up robust features","volume":"3951","author":"Bay","year":"2006","journal-title":"Lecture Notes in Comput. Sci."},{"issue":"3","key":"10.1016\/j.image.2026.117540_b14","doi-asserted-by":"crossref","first-page":"328","DOI":"10.1109\/29.21701","article-title":"Phoneme recognition using time-delay neural networks","volume":"37","author":"Waibel","year":"1989","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"issue":"4","key":"10.1016\/j.image.2026.117540_b15","doi-asserted-by":"crossref","first-page":"541","DOI":"10.1162\/neco.1989.1.4.541","article-title":"Backpropagation applied to handwritten zip code recognition","volume":"1","author":"LeCun","year":"1989","journal-title":"Neural Comput."},{"issue":"6","key":"10.1016\/j.image.2026.117540_b16","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1145\/3065386","article-title":"Imagenet classification with deep convolutional neural networks","volume":"60","author":"Krizhevsky","year":"2017","journal-title":"Commun. ACM"},{"key":"10.1016\/j.image.2026.117540_b17","doi-asserted-by":"crossref","unstructured":"C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, A. Rabinovich, Going deeper with convolutions, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2015, pp. 1\u20139.","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"10.1016\/j.image.2026.117540_b18","doi-asserted-by":"crossref","unstructured":"C. Szegedy, V. Vanhoucke, S. Ioffe, J. Shlens, Z. Wojna, Rethinking the inception architecture for computer vision, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 2818\u20132826.","DOI":"10.1109\/CVPR.2016.308"},{"key":"10.1016\/j.image.2026.117540_b19","article-title":"Inception-v4, inception-resnet and the impact of residual connections on learning","volume":"vol. 31","author":"Szegedy","year":"2017"},{"key":"10.1016\/j.image.2026.117540_b20","series-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"10.1016\/j.image.2026.117540_b21","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.image.2026.117540_b22","series-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications","author":"Howard","year":"2017"},{"key":"10.1016\/j.image.2026.117540_b23","doi-asserted-by":"crossref","unstructured":"M. Sandler, A. Howard, M. Zhu, A. Zhmoginov, L.-C. Chen, Mobilenetv2: Inverted residuals and linear bottlenecks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 4510\u20134520.","DOI":"10.1109\/CVPR.2018.00474"},{"key":"10.1016\/j.image.2026.117540_b24","doi-asserted-by":"crossref","unstructured":"A. Howard, M. Sandler, G. Chu, L.-C. Chen, B. Chen, M. Tan, W. Wang, Y. Zhu, R. Pang, V. Vasudevan, et al., Searching for mobilenetv3, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 1314\u20131324.","DOI":"10.1109\/ICCV.2019.00140"},{"key":"10.1016\/j.image.2026.117540_b25","doi-asserted-by":"crossref","unstructured":"X. Zhang, X. Zhou, M. Lin, J. Sun, Shufflenet: An extremely efficient convolutional neural network for mobile devices, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 6848\u20136856.","DOI":"10.1109\/CVPR.2018.00716"},{"key":"10.1016\/j.image.2026.117540_b26","doi-asserted-by":"crossref","unstructured":"N. Ma, X. Zhang, H.-T. Zheng, J. Sun, Shufflenet v2: Practical guidelines for efficient cnn architecture design, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 116\u2013131.","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"10.1016\/j.image.2026.117540_b27","series-title":"International Conference on Machine Learning","first-page":"6105","article-title":"Efficientnet: Rethinking model scaling for convolutional neural networks","author":"Tan","year":"2019"},{"key":"10.1016\/j.image.2026.117540_b28","series-title":"International Conference on Machine Learning","first-page":"10096","article-title":"Efficientnetv2: Smaller models and faster training","author":"Tan","year":"2021"},{"key":"10.1016\/j.image.2026.117540_b29","doi-asserted-by":"crossref","unstructured":"Z. Liu, H. Mao, C.-Y. Wu, C. Feichtenhofer, T. Darrell, S. Xie, A convnet for the 2020s, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 11976\u201311986.","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"10.1016\/j.image.2026.117540_b30","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.image.2026.117540_b31","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.image.2026.117540_b32","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I 16","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.image.2026.117540_b33","doi-asserted-by":"crossref","unstructured":"M. Caron, H. Touvron, I. Misra, H. J\u00e9gou, J. Mairal, P. Bojanowski, A. Joulin, Emerging properties in self-supervised vision transformers, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 9650\u20139660.","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"10.1016\/j.image.2026.117540_b34","series-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2023"},{"issue":"1","key":"10.1016\/j.image.2026.117540_b35","doi-asserted-by":"crossref","first-page":"22100","DOI":"10.1038\/s41598-024-73853-3","article-title":"DINO-mix enhancing visual place recognition with foundational vision model and feature mixing","volume":"14","author":"Huang","year":"2024","journal-title":"Sci. Rep."},{"issue":"2","key":"10.1016\/j.image.2026.117540_b36","doi-asserted-by":"crossref","first-page":"1286","DOI":"10.1109\/LRA.2023.3343602","article-title":"Anyloc: Towards universal visual place recognition","volume":"9","author":"Keetha","year":"2023","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.image.2026.117540_b37","first-page":"1","article-title":"Visual categorization with bags of keypoints","volume":"vol. 1","author":"Csurka","year":"2004"},{"issue":"15","key":"10.1016\/j.image.2026.117540_b38","doi-asserted-by":"crossref","first-page":"5177","DOI":"10.3390\/app10155177","article-title":"Leaf image recognition based on bag of features","volume":"10","author":"Zhang","year":"2020","journal-title":"Appl. Sci."},{"issue":"10","key":"10.1016\/j.image.2026.117540_b39","doi-asserted-by":"crossref","first-page":"3301","DOI":"10.1109\/TCSVT.2019.2920657","article-title":"Smoke vehicle detection based on spatiotemporal bag-of-features and professional convolutional neural network","volume":"30","author":"Tao","year":"2019","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"4","key":"10.1016\/j.image.2026.117540_b40","doi-asserted-by":"crossref","first-page":"299","DOI":"10.1049\/iet-cvi.2013.0132","article-title":"Global and local exploitation for saliency using bag-of-words","volume":"8","author":"Zheng","year":"2014","journal-title":"IET Comput. Vis."},{"key":"10.1016\/j.image.2026.117540_b41","doi-asserted-by":"crossref","first-page":"690","DOI":"10.1016\/j.neucom.2019.12.142","article-title":"Local feature extracted by the improved bag of features method for person re-identification","volume":"458","author":"Zhang","year":"2021","journal-title":"Neurocomputing"},{"issue":"7","key":"10.1016\/j.image.2026.117540_b42","doi-asserted-by":"crossref","first-page":"1908","DOI":"10.1109\/TIP.2010.2045169","article-title":"Semantics-preserving bag-of-words models and applications","volume":"19","author":"Wu","year":"2010","journal-title":"IEEE Trans. Image Process."},{"issue":"04","key":"10.1016\/j.image.2026.117540_b43","doi-asserted-by":"crossref","DOI":"10.1142\/S0218001421530013","article-title":"Annotation-free word spotting with bag-of-features HMMs","volume":"35","author":"Rothacker","year":"2021","journal-title":"Int. J. Pattern Recognit. Artif. Intell."},{"key":"10.1016\/j.image.2026.117540_b44","doi-asserted-by":"crossref","first-page":"20821","DOI":"10.1007\/s11042-021-10612-w","article-title":"Bag-of-visual-words codebook generation using deep features for effective classification of imbalanced multi-class image datasets","volume":"80","author":"Saini","year":"2021","journal-title":"Multimedia Tools Appl."},{"key":"10.1016\/j.image.2026.117540_b45","doi-asserted-by":"crossref","first-page":"1429","DOI":"10.1007\/s40747-021-00275-3","article-title":"Improved bag-of-features using grey relational analysis for classification of histology images","volume":"7","author":"Pal","year":"2021","journal-title":"Complex & Intell. Syst."},{"key":"10.1016\/j.image.2026.117540_b46","doi-asserted-by":"crossref","unstructured":"N. Passalis, A. Tefas, Learning bag-of-features pooling for deep convolutional neural networks, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 5755\u20135763.","DOI":"10.1109\/ICCV.2017.614"},{"key":"10.1016\/j.image.2026.117540_b47","doi-asserted-by":"crossref","unstructured":"R. Arandjelovic, A. Zisserman, All about VLAD, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2013, pp. 1578\u20131585.","DOI":"10.1109\/CVPR.2013.207"},{"key":"10.1016\/j.image.2026.117540_b48","doi-asserted-by":"crossref","unstructured":"R. Wang, Y. Shen, W. Zuo, S. Zhou, N. Zheng, Transvpr: Transformer-based place recognition with multi-level attention aggregation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 13648\u201313657.","DOI":"10.1109\/CVPR52688.2022.01328"},{"key":"10.1016\/j.image.2026.117540_b49","doi-asserted-by":"crossref","unstructured":"F. Warburg, S. Hauberg, M. Milford, T. Fischer, Mapillary Street-Level Sequences: A Dataset for Lifelong Place Recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR, 2020, pp. 2626\u20132635.","DOI":"10.1109\/CVPR42600.2020.00270"}],"container-title":["Signal Processing: Image Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0923596526000639?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0923596526000639?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T06:44:31Z","timestamp":1777013071000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0923596526000639"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":49,"alternative-id":["S0923596526000639"],"URL":"https:\/\/doi.org\/10.1016\/j.image.2026.117540","relation":{},"ISSN":["0923-5965"],"issn-type":[{"value":"0923-5965","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Enhanced image retrieval: Leveraging multi-head attention & multi-scale descriptors and hybrid aggregation feature indexing","name":"articletitle","label":"Article Title"},{"value":"Signal Processing: Image Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.image.2026.117540","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"117540"}}