{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T02:21:10Z","timestamp":1774578070536,"version":"3.50.1"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2023,1,31]],"date-time":"2023-01-31T00:00:00Z","timestamp":1675123200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,31]],"date-time":"2023-01-31T00:00:00Z","timestamp":1675123200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2023,3]]},"DOI":"10.1007\/s11432-022-3586-y","type":"journal-article","created":{"date-parts":[[2023,2,11]],"date-time":"2023-02-11T12:29:36Z","timestamp":1676118576000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Siamese transformer with hierarchical concept embedding for fine-grained image recognition"],"prefix":"10.1007","volume":"66","author":[{"given":"Yilin","family":"Lyu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liping","family":"Jing","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiaqi","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingzhe","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinyue","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jian","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,1,31]]},"reference":[{"key":"3586_CR1","unstructured":"Welinder P, Branson S, Mita T, et al. Caltech-UCSD Birds 200. Technical Report CNS-TR-2010-001, California Institute of Technology, 2010"},{"key":"3586_CR2","unstructured":"Horn G V, Branson S, Farrell R, et al. Building a bird recognition app and large scale dataset with citizen scientists: the fine print in fine-grained dataset collection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Boston, 2015. 595\u2013604"},{"key":"3586_CR3","unstructured":"Maji S, Rahtu E, Kannala J, et al. Fine-grained visual classification of aircraft. 2013. ArXiv:1306.5151"},{"key":"3586_CR4","unstructured":"Khosla A, Jayadevaprakash N, Yao B, et al. Novel dataset for fine-grained image categorization. In: Proceedings of the 1st Workshop on Fine-Grained Visual Categorization, IEEE Conference on Computer Vision and Pattern Recognition, 2011"},{"key":"3586_CR5","doi-asserted-by":"crossref","unstructured":"Chen T, Wu W, Gao Y, et al. Fine-grained representation learning and recognition by exploiting hierarchical semantic embedding. In: Proceedings of the 26th ACM International Conference on Multimedia, Seoul, 2018. 2023\u20132031","DOI":"10.1145\/3240508.3240523"},{"key":"3586_CR6","doi-asserted-by":"crossref","unstructured":"Zhang N, Donahue J, Girshick R B, et al. Part-based R-CNNs for fine-grained category detection. In: Proceedings of the 13th European Conference on Computer Vision, Zurich, 2014. 8689: 834\u2013849","DOI":"10.1007\/978-3-319-10590-1_54"},{"key":"3586_CR7","doi-asserted-by":"crossref","unstructured":"Lin D, Shen X, Lu C, et al. Deep LAC: deep localization, alignment and classification for fine-grained recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Boston, 2015. 1666\u20131674","DOI":"10.1109\/CVPR.2015.7298775"},{"key":"3586_CR8","doi-asserted-by":"crossref","unstructured":"Krause J, Jin H, Yang J, et al. Fine-grained recognition without part annotations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Boston, 2015. 5546\u20135555","DOI":"10.1109\/CVPR.2015.7299194"},{"key":"3586_CR9","doi-asserted-by":"crossref","unstructured":"Zhang H, Xu T, Elhoseiny M, et al. SPDA-CNN: unifying semantic part detection and abstraction for fine-grained recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, 2016. 1143\u20131152","DOI":"10.1109\/CVPR.2016.129"},{"key":"3586_CR10","doi-asserted-by":"crossref","unstructured":"Fu J, Zheng H, Mei T. Look closer to see better: recurrent attention convolutional neural network for fine-grained image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, 2017. 4476\u20134484","DOI":"10.1109\/CVPR.2017.476"},{"key":"3586_CR11","doi-asserted-by":"crossref","unstructured":"Zheng H, Fu J, Mei T, et al. Learning multi-attention convolutional neural network for fine-grained image recognition. In: Proceedings of the IEEE International Conference on Computer Vision, Venice, 2017. 5219\u20135227","DOI":"10.1109\/ICCV.2017.557"},{"key":"3586_CR12","doi-asserted-by":"crossref","unstructured":"Li Z, Yang Y, Liu X, et al. Dynamic computational time for visual attention. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, Venice, 2017. 1199\u20131209","DOI":"10.1109\/ICCVW.2017.145"},{"key":"3586_CR13","doi-asserted-by":"crossref","unstructured":"He X, Peng Y. Weakly supervised learning of part selection model with spatial constraints for fine-grained image classification. In: Proceedings of the 31st AAAI Conference on Artificial Intelligence, San Francisco, 2017. 4075\u20134081","DOI":"10.1609\/aaai.v31i1.11223"},{"key":"3586_CR14","doi-asserted-by":"crossref","unstructured":"Yang Z, Luo T, Wang D, et al. Learning to navigate for fine-grained classification. In: Proceedings of the 15th European Conference on Computer Vision, Munich, 2018. 11218: 438\u2013454","DOI":"10.1007\/978-3-030-01264-9_26"},{"key":"3586_CR15","doi-asserted-by":"publisher","first-page":"1235","DOI":"10.1007\/s11263-019-01176-2","volume":"127","author":"X He","year":"2019","unstructured":"He X, Peng Y, Zhao J. Which and how many regions to gaze: focus discriminative regions for fine-grained visual categorization. Int J Comput Vis, 2019, 127: 1235\u20131255","journal-title":"Int J Comput Vis"},{"key":"3586_CR16","doi-asserted-by":"crossref","unstructured":"Wang Z, Wang S, Zhang P, et al. Weakly supervised fine-grained image classification via correlation-guided discriminative learning. In: Proceedings of the 27th ACM International Conference on Multimedia, Nice, 2019. 1851\u20131860","DOI":"10.1145\/3343031.3350976"},{"key":"3586_CR17","doi-asserted-by":"crossref","unstructured":"Wang Z, Wang S, Li H, et al. Graph-propagation based correlation learning for weakly supervised fine-grained image classification. In: Proceedings of the 34th AAAI Conference on Artificial Intelligence, New York, 2020. 12289\u201312296","DOI":"10.1609\/aaai.v34i07.6912"},{"key":"3586_CR18","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et al. An image is worth 16\u00d716 words: transformers for image recognition at scale. In: Proceedings of the 9th International Conference on Learning Representations, Vienna, 2021"},{"key":"3586_CR19","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, et al. End-to-end object detection with transformers. In: Proceedings of the 16th European Conference on Computer Vision, Glasgow, 2020. 12346: 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"3586_CR20","unstructured":"Zhu X, Su W, Lu L, et al. Deformable DETR: deformable transformers for end-to-end object detection. In: Proceedings of the 9th International Conference on Learning Representations, Vienna, 2021"},{"key":"3586_CR21","doi-asserted-by":"crossref","unstructured":"Zheng S, Lu J, Zhao H, et al. Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, 2021. 6881\u20136890","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"3586_CR22","unstructured":"Touvron H, Cord M, Douze M, et al. Training data-efficient image transformers & distillation through attention. In: Proceedings of the 38th International Conference on Machine Learning, 2021. 10347\u201310357"},{"key":"3586_CR23","doi-asserted-by":"crossref","unstructured":"He S, Luo H, Wang P, et al. TransReID: transformer-based object re-identification. In: Proceedings of IEEE\/CVF International Conference on Computer Vision, Montreal, 2021. 14993\u201315002","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"3586_CR24","doi-asserted-by":"crossref","unstructured":"He J, Chen J, Liu S, et al. TransFG: a transformer architecture for fine-grained recognition. In: Proceedings of the 36th AAAI Conference on Artificial Intelligence, the 34th Conference on Innovative Applications of Artificial Intelligence, and the 12th Symposium on Educational Advances in Artificial Intelligence, 2022. 852\u2013860","DOI":"10.1609\/aaai.v36i1.19967"},{"key":"3586_CR25","doi-asserted-by":"crossref","unstructured":"Wang D, Shen Z, Shao J, et al. Multiple granularity descriptors for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision, Santiago, 2015. 2399\u20132406","DOI":"10.1109\/ICCV.2015.276"},{"key":"3586_CR26","doi-asserted-by":"publisher","first-page":"107889","DOI":"10.1016\/j.patcog.2021.107889","volume":"115","author":"G He","year":"2021","unstructured":"He G, Li F, Wang Q, et al. A hierarchical sampling based triplet network for fine-grained image classification. Pattern Recognit, 2021, 115: 107889","journal-title":"Pattern Recognit"},{"key":"3586_CR27","doi-asserted-by":"crossref","unstructured":"Lin T, RoyChowdhury A, Maji S. Bilinear CNN models for fine-grained visual recognition. In: Proceedings of the IEEE International Conference on Computer Vision, Santiago, 2015. 1449\u20131457","DOI":"10.1109\/ICCV.2015.170"},{"key":"3586_CR28","doi-asserted-by":"crossref","unstructured":"Gao Y, Beijbom O, Zhang N, et al. Compact bilinear pooling. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, 2016. 317\u2013326","DOI":"10.1109\/CVPR.2016.41"},{"key":"3586_CR29","doi-asserted-by":"crossref","unstructured":"Kong S, Fowlkes C C. Low-rank bilinear pooling for fine-grained classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, 2017. 7025\u20137034","DOI":"10.1109\/CVPR.2017.743"},{"key":"3586_CR30","doi-asserted-by":"crossref","unstructured":"Wei X, Zhang Y, Gong Y, et al. Grassmann pooling as compact homogeneous bilinear pooling for fine-grained visual classification. In: Proceedings of the 15th European Conference on Computer Vision, Munich, 2018. 11207: 365\u2013380","DOI":"10.1007\/978-3-030-01219-9_22"},{"key":"3586_CR31","doi-asserted-by":"crossref","unstructured":"Li P, Xie J, Wang Q, et al. Is second-order information helpful for large-scale visual recognition? In: Proceedings of the IEEE International Conference on Computer Vision, Venice, 2017. 2089\u20132097","DOI":"10.1109\/ICCV.2017.228"},{"key":"3586_CR32","doi-asserted-by":"crossref","unstructured":"Zhuang P, Wang Y, Qiao Y. Learning attentive pairwise interaction for fine-grained classification. In: Proceedings of the 34th AAAI Conference on Artificial Intelligence, New York, 2020. 13130\u201313137","DOI":"10.1609\/aaai.v34i07.7016"},{"key":"3586_CR33","doi-asserted-by":"crossref","unstructured":"Gao Y, Han X, Wang X, et al. Channel interaction networks for fine-grained image categorization. In: Proceedings of the 34th AAAI Conference on Artificial Intelligence, New York, 2020. 10818\u201310825","DOI":"10.1609\/aaai.v34i07.6712"},{"key":"3586_CR34","doi-asserted-by":"crossref","unstructured":"Zhang S, Huang Q, Hua G, et al. Building contextual visual vocabulary for large-scale image applications. In: Proceedings of the 18th ACM International Conference on Multimedia, Firenze, 2010. 501\u2013510","DOI":"10.1145\/1873951.1874018"},{"key":"3586_CR35","unstructured":"Conde M V, Turgutlu K. Exploring vision transformers for fine-grained classification. 2021. ArXiv:2106.10587"},{"key":"3586_CR36","doi-asserted-by":"crossref","unstructured":"Zhang L, Huang S, Liu W, et al. Learning a mixture of granularity-specific experts for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision, Seoul, 2019. 8330\u20138339","DOI":"10.1109\/ICCV.2019.00842"},{"key":"3586_CR37","doi-asserted-by":"crossref","unstructured":"He X, Peng Y. Only learn one sample: fine-grained visual categorization with one sample training. In: Proceedings of ACM International Conference on Multimedia, Seoul, 2018. 1372\u20131380","DOI":"10.1145\/3240508.3240557"},{"key":"3586_CR38","doi-asserted-by":"crossref","unstructured":"Zheng H, Fu J, Zha Z, et al. Looking for the devil in the details: learning trilinear attention sampling network for fine-grained image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Long Beach, 2019. 5012\u20135021","DOI":"10.1109\/CVPR.2019.00515"},{"key":"3586_CR39","doi-asserted-by":"crossref","unstructured":"Ding Y, Zhou Y, Zhu Y, et al. Selective sparse sampling for fine-grained image recognition. In: Proceedings of the IEEE International Conference on Computer Vision, Seoul, 2019. 6598\u20136607","DOI":"10.1109\/ICCV.2019.00670"},{"key":"3586_CR40","doi-asserted-by":"crossref","unstructured":"Abnar S, Zuidema W H. Quantifying attention flow in transformers. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, 2020. 4190\u20134197","DOI":"10.18653\/v1\/2020.acl-main.385"},{"key":"3586_CR41","first-page":"73","volume-title":"Siamese Neural Networks: An Overview","author":"D Chicco","year":"2021","unstructured":"Chicco D. Siamese Neural Networks: An Overview. New York: Springer, 2021. 73\u201394"},{"key":"3586_CR42","doi-asserted-by":"publisher","first-page":"665","DOI":"10.1111\/j.1551-6709.2009.01024.x","volume":"33","author":"C M O\u2019Connor","year":"2009","unstructured":"O\u2019Connor C M, Cree G S, McRae K. Conceptual hierarchies in a flat attractor network: dynamics of learning and computations. Cogn Sci, 2009, 33: 665\u2013708","journal-title":"Cogn Sci"},{"key":"3586_CR43","doi-asserted-by":"publisher","first-page":"181","DOI":"10.1016\/j.ipl.2005.11.003","volume":"97","author":"P S Efraimidis","year":"2006","unstructured":"Efraimidis P S, Spirakis P G. Weighted random sampling with a reservoir. Inf Process Lett, 2006, 97: 181\u2013185","journal-title":"Inf Process Lett"},{"key":"3586_CR44","doi-asserted-by":"crossref","unstructured":"Zhong Z, Zheng L, Kang G, et al. Random erasing data augmentation. In: Proceedings of the 34th AAAI Conference on Artificial Intelligence, New York, 2020. 13001\u201313008","DOI":"10.1609\/aaai.v34i07.7000"},{"key":"3586_CR45","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, et al. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, 2016. 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"3586_CR46","doi-asserted-by":"crossref","unstructured":"Luo W, Yang X, Mo X, et al. Cross-X learning for fine-grained visual categorization. In: Proceedings of the IEEE International Conference on Computer Vision, Seoul, 2019. 8241\u20138250","DOI":"10.1109\/ICCV.2019.00833"},{"key":"3586_CR47","doi-asserted-by":"crossref","unstructured":"Liu C, Xie H, Zha Z, et al. Filtration and distillation: enhancing region attention for fine-grained visual categorization. In: Proceedings of the 34th AAAI Conference on Artificial Intelligence, New York, 2020. 11555\u201311562","DOI":"10.1609\/aaai.v34i07.6822"},{"key":"3586_CR48","unstructured":"Hu T, Qi H. See better before looking closer: weakly supervised data augmentation network for fine-grained visual classification. 2019. ArXiv:1901.09891"},{"key":"3586_CR49","doi-asserted-by":"crossref","unstructured":"Ge W, Lin X, Yu Y. Weakly supervised complementary parts models for fine-grained image classification from the bottom up. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Long Beach, 2019. 3034\u20133043","DOI":"10.1109\/CVPR.2019.00315"},{"key":"3586_CR50","doi-asserted-by":"crossref","unstructured":"Du R, Chang D, Bhunia A K, et al. Fine-grained visual classification via progressive multi-granularity training of jigsaw patches. In: Proceedings of the 16th European Conference on Computer Vision, Glasgow, 2020. 12365: 153\u2013168","DOI":"10.1007\/978-3-030-58565-5_10"},{"key":"3586_CR51","doi-asserted-by":"crossref","unstructured":"Li P, Xie J, Wang Q, et al. Towards faster training of global covariance pooling networks by iterative matrix square root normalization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Salt Lake City, 2018. 947\u2013955","DOI":"10.1109\/CVPR.2018.00105"},{"key":"3586_CR52","doi-asserted-by":"publisher","first-page":"476","DOI":"10.1109\/TIP.2019.2921876","volume":"29","author":"H Zheng","year":"2020","unstructured":"Zheng H, Fu J, Zha Z J, et al. Learning rich part hierarchies with progressive attention networks for fine-grained image recognition. IEEE Trans Image Process, 2020, 29: 476\u2013488","journal-title":"IEEE Trans Image Process"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-022-3586-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-022-3586-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-022-3586-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,18]],"date-time":"2024-04-18T20:32:01Z","timestamp":1713472321000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-022-3586-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,31]]},"references-count":52,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,3]]}},"alternative-id":["3586"],"URL":"https:\/\/doi.org\/10.1007\/s11432-022-3586-y","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,1,31]]},"assertion":[{"value":"25 January 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 May 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 August 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 January 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"132107"}}