{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T14:30:37Z","timestamp":1768314637910,"version":"3.49.0"},"reference-count":57,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"3","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2022,3,1]]},"DOI":"10.1587\/transinf.2021edp7166","type":"journal-article","created":{"date-parts":[[2022,2,28]],"date-time":"2022-02-28T22:25:09Z","timestamp":1646087109000},"page":"713-726","source":"Crossref","is-referenced-by-count":15,"title":["Recursive Multi-Scale Channel-Spatial Attention for Fine-Grained Image Classification"],"prefix":"10.1587","volume":"E105.D","author":[{"given":"Dichao","family":"LIU","sequence":"first","affiliation":[{"name":"Graduate School of Informatics, Nagoya University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"WANG","sequence":"additional","affiliation":[{"name":"College of Information Science and Engineering, Ritsumeikan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kenji","family":"MASE","sequence":"additional","affiliation":[{"name":"Graduate School of Informatics, Nagoya University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jien","family":"KATO","sequence":"additional","affiliation":[{"name":"College of Information Science and Engineering, Ritsumeikan University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] W. Ge, X. Lin, and Y. Yu, \u201cWeakly supervised complementary parts models for fine-grained image classification from the bottom up,\u201d Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.3034-3043, 2019. 10.1109\/cvpr.2019.00315","DOI":"10.1109\/CVPR.2019.00315"},{"key":"2","doi-asserted-by":"publisher","unstructured":"[2] Y. Ding, Z. Ma, S. Wen, J. Xie, D. Chang, Z. Si, M. Wu, and H. Ling, \u201cAp-cnn: weakly supervised attention pyramid convolutional neural network for fine-grained visual classification,\u201d IEEE Transactions on Image Processing, vol.30, pp.2826-2836, 2021. 10.1109\/tip.2021.3055617","DOI":"10.1109\/TIP.2021.3055617"},{"key":"3","doi-asserted-by":"publisher","unstructured":"[3] T. Rao, X. Li, H. Zhang, and M. Xu, \u201cMulti-level region-based convolutional neural network for image emotion classification,\u201d Neurocomputing, vol.333, pp.429-439, 2019. 10.1016\/j.neucom.2018.12.053","DOI":"10.1016\/j.neucom.2018.12.053"},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] H. Zheng, J. Fu, Z.-J. Zha, and J. Luo, \u201cLooking for the devil in the details: Learning trilinear attention sampling network for fine-grained image recognition,\u201d Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.5012-5021, 2019. 10.1109\/cvpr.2019.00515","DOI":"10.1109\/CVPR.2019.00515"},{"key":"5","doi-asserted-by":"publisher","unstructured":"[5] X. He, Y. Peng, and J. Zhao, \u201cFast fine-grained image classification via weakly supervised discriminative localization,\u201d IEEE Transactions on Circuits and Systems for Video Technology, vol.29, no.5, pp.1394-1407, 2018. 10.1109\/tcsvt.2018.2834480","DOI":"10.1109\/TCSVT.2018.2834480"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] Z. Yang, T. Luo, D. Wang, Z. Hu, J. Gao, and L. Wang, \u201cLearning to navigate for fine-grained classification,\u201d Proceedings of the European Conference on Computer Vision, vol.11218, pp.438-454, 2018. 10.1007\/978-3-030-01264-9_26","DOI":"10.1007\/978-3-030-01264-9_26"},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] J. Fu, H. Zheng, and T. Mei, \u201cLook closer to see better: Recurrent attention convolutional neural network for fine-grained image recognition,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.4438-4446, 2017. 10.1109\/cvpr.2017.476","DOI":"10.1109\/CVPR.2017.476"},{"key":"8","doi-asserted-by":"publisher","unstructured":"[8] Y. Zhang, X.-S. Wei, J. Wu, J. Cai, J. Lu, V.-A. Nguyen, and M.N. Do, \u201cWeakly supervised fine-grained categorization with part-based image representation,\u201d IEEE Transactions on Image Processing, vol.25, no.4, pp.1713-1725, 2016. 10.1109\/tip.2016.2531289","DOI":"10.1109\/TIP.2016.2531289"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] M. Simon and E. Rodner, \u201cNeural activation constellations: Unsupervised part model discovery with convolutional networks,\u201d Proceedings of the IEEE international conference on computer vision, pp.1143-1151, 2015. 10.1109\/iccv.2015.136","DOI":"10.1109\/ICCV.2015.136"},{"key":"10","unstructured":"[10] T. Xiao, Y. Xu, K. Yang, J. Zhang, Y. Peng, and Z. Zhang, \u201cThe application of two-level attention models in deep convolutional neural network for fine-grained image classification,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.842-850, 2015. 10.1109\/cvpr.2015.7298685"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] L. Xie, Q. Tian, R. Hong, S. Yan, and B. Zhang, \u201cHierarchical part matching for fine-grained visual categorization,\u201d Proceedings of the IEEE international conference on computer vision, pp.1641-1648, 2013. 10.1109\/iccv.2013.206","DOI":"10.1109\/ICCV.2013.206"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] S. Huang, Z. Xu, D. Tao, and Y. Zhang, \u201cPart-stacked cnn for fine-grained visual categorization,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.1173-1182, 2016. 10.1109\/cvpr.2016.132","DOI":"10.1109\/CVPR.2016.132"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] D. Lin, X. Shen, C. Lu, and J. Jia, \u201cDeep lac: Deep localization, alignment and classification for fine-grained recognition,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.1666-1674, 2015. 10.1109\/cvpr.2015.7298775","DOI":"10.1109\/CVPR.2015.7298775"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] O.M. Parkhi, A. Vedaldi, C. Jawahar, and A. Zisserman, \u201cThe truth about cats and dogs,\u201d 2011 International Conference on Computer Vision, pp.1427-1434, IEEE, 2011. 10.1109\/iccv.2011.6126398","DOI":"10.1109\/ICCV.2011.6126398"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] H. Zhang, T. Xu, M. Elhoseiny, X. Huang, S. Zhang, A. Elgammal, and D. Metaxas, \u201cSpda-cnn: Unifying semantic part detection and abstraction for fine-grained recognition,\u201d Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp.1143-1152, 2016. 10.1109\/cvpr.2016.129","DOI":"10.1109\/CVPR.2016.129"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] N. Zhang, J. Donahue, R. Girshick, and T. Darrell, \u201cPart-based r-cnns for fine-grained category detection,\u201d European conference on computer vision, vol.8689, pp.834-849, Springer, 2014. 10.1007\/978-3-319-10590-1_54","DOI":"10.1007\/978-3-319-10590-1_54"},{"key":"17","unstructured":"[17] M. Jaderberg, K. Simonyan, A. Zisserman, et al., \u201cSpatial transformer networks,\u201d Advances in neural information processing systems, vol.28, pp.2017-2025, 2015."},{"key":"18","doi-asserted-by":"publisher","unstructured":"[18] Y. Peng, X. He, and J. Zhao, \u201cObject-part attention model for fine-grained image classification,\u201d IEEE Transactions on Image Processing, vol.27, no.3, pp.1487-1500, 2017. 10.1109\/tip.2017.2774041","DOI":"10.1109\/TIP.2017.2774041"},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] X. Zhang, H. Xiong, W. Zhou, W. Lin, and Q. Tian, \u201cPicking deep filter responses for fine-grained image recognition,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.1134-1142, 2016. 10.1109\/cvpr.2016.128","DOI":"10.1109\/CVPR.2016.128"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] K. He, G. Gkioxari, P. Doll\u00e1r, and R. Girshick, \u201cMask r-cnn,\u201d Proceedings of the IEEE international conference on computer vision, pp.2961-2969, 2017. 10.1109\/iccv.2017.322","DOI":"10.1109\/ICCV.2017.322"},{"key":"21","doi-asserted-by":"publisher","unstructured":"[21] C. Sutton and A. McCallum, \u201cAn introduction to conditional random fields,\u201d Mach. Learn, vol.4, no.4, pp.267-373, 2012. 10.1561\/2200000013","DOI":"10.1561\/2200000013"},{"key":"22","doi-asserted-by":"publisher","unstructured":"[22] S. Hochreiter and J. Schmidhuber, \u201cLong short-term memory,\u201d Neural computation, vol.9, no.8, pp.1735-1780, 1997. 10.1162\/neco.1997.9.8.1735","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] H. Zhao, Y. Zhang, S. Liu, J. Shi, C.C. Loy, D. Lin, and J. Jia, \u201cPsanet: Point-wise spatial attention network for scene parsing,\u201d Proceedings of the European Conference on Computer Vision (ECCV), vol.11213, pp.270-286, 2018. 10.1007\/978-3-030-01240-3_17","DOI":"10.1007\/978-3-030-01240-3_17"},{"key":"24","doi-asserted-by":"crossref","unstructured":"[24] J. Hu, L. Shen, and G. Sun, \u201cSqueeze-and-excitation networks,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.7132-7141, 2018. 10.1109\/cvpr.2018.00745","DOI":"10.1109\/CVPR.2018.00745"},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] Y. Dai, F. Gieseke, S. Oehmcke, Y. Wu, and K. Barnard, \u201cAttentional feature fusion,\u201d Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp.3560-3569, 2021. 10.1109\/wacv48630.2021.00360","DOI":"10.1109\/WACV48630.2021.00360"},{"key":"26","doi-asserted-by":"publisher","unstructured":"[26] J. Park, S. Woo, J.-Y. Lee, and I.S. Kweon, \u201cA simple and light-weight attention module for convolutional neural networks,\u201d International Journal of Computer Vision, vol.128, no.4, pp.783-798, 2020. 10.1007\/s11263-019-01283-0","DOI":"10.1007\/s11263-019-01283-0"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] S. Woo, J. Park, J.-Y. Lee, and I.S. Kweon, \u201cCbam: Convolutional block attention module,\u201d Proceedings of the European conference on computer vision (ECCV), vol.11211, pp.3-19, 2018. 10.1007\/978-3-030-01234-2_1","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] K. He, X. Zhang, S. Ren, and J. Sun, \u201cDeep residual learning for image recognition,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.770-778, 2016. 10.1109\/cvpr.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"29","unstructured":"[29] K. Simonyan and A. Zisserman, \u201cVery deep convolutional networks for large-scale image recognition,\u201d arXiv preprint arXiv:1409.1556, 2014."},{"key":"30","unstructured":"[30] C. Wah, S. Branson, P. Welinder, P. Perona, and S. Belongie, \u201cThe Caltech-UCSD Birds-200-2011 Dataset,\u201d Tech. Rep. CNS-TR-2011-001, California Institute of Technology, 2011."},{"key":"31","doi-asserted-by":"crossref","unstructured":"[31] J. Krause, M. Stark, J. Deng, and L. Fei-Fei, \u201c3d object representations for fine-grained categorization,\u201d 4th International IEEE Workshop on 3D Representation and Recognition (3dRR-13), Sydney, Australia, 2013. 10.1109\/iccvw.2013.77","DOI":"10.1109\/ICCVW.2013.77"},{"key":"32","doi-asserted-by":"crossref","unstructured":"[32] R. Du, D. Chang, A.K. Bhunia, J. Xie, Z. Ma, Y.-Z. Song, and J. Guo, \u201cFine-grained visual classification via progressive multi-granularity training of jigsaw patches,\u201d European Conference on Computer Vision, vol.12365, pp.153-168, Springer, 2020. 10.1007\/978-3-030-58565-5_10","DOI":"10.1007\/978-3-030-58565-5_10"},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] T. Zhang, D. Chang, Z. Ma, and J. Guo, \u201cProgressive co-attention network for fine-grained visual classification,\u201d arXiv preprint arXiv:2101.08527, 2021.","DOI":"10.1109\/VCIP53242.2021.9675376"},{"key":"34","doi-asserted-by":"crossref","unstructured":"[34] T.H. Kim, M.S. Sajjadi, M. Hirsch, and B. Scholkopf, \u201cSpatio-temporal transformer network for video restoration,\u201d Proceedings of the European Conference on Computer Vision (ECCV), vol.11207, pp.111-127, 2018. 10.1007\/978-3-030-01219-9_7","DOI":"10.1007\/978-3-030-01219-9_7"},{"key":"35","doi-asserted-by":"crossref","unstructured":"[35] K.M. Schatz, E. Quintanilla, S. Vyas, and Y.S. Rawat, \u201cA recurrent transformer network for novel view action synthesis,\u201d Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, Aug. 23-28, 2020, Proceedings, Part XXVII 16, vol.12372, pp.410-426, Springer, 2020. 10.1007\/978-3-030-58583-9_25","DOI":"10.1007\/978-3-030-58583-9_25"},{"key":"36","unstructured":"[36] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, \u0141. Kaiser, and I. Polosukhin, \u201cAttention is all you need,\u201d Advances in neural information processing systems, pp.5998-6008, 2017."},{"key":"37","unstructured":"[37] A. Dosovitskiy, L. Beyer, A. Kolesnikov, D. Weissenborn, X. Zhai, T. Unterthiner, M. Dehghani, M. Minderer, G. Heigold, S. Gelly, et al., \u201cAn image is worth 16x16 words: Transformers for image recognition at scale,\u201d arXiv preprint arXiv:2010.11929, 2020."},{"key":"38","unstructured":"[38] J. He, J.N. Chen, S. Liu, A. Kortylewski, C. Yang, Y. Bai, C. Wang, and A. Yuille, \u201cTransfg: A transformer architecture for fine-grained recognition,\u201d arXiv preprint arXiv:2103.07976, 2021."},{"key":"39","doi-asserted-by":"publisher","unstructured":"[39] S.R. Safavian and D. Landgrebe, \u201cA survey of decision tree classifier methodology,\u201d IEEE transactions on systems, man, and cybernetics, vol.21, no.3, pp.660-674, 1991. 10.1109\/21.97458","DOI":"10.1109\/21.97458"},{"key":"40","doi-asserted-by":"crossref","unstructured":"[40] M. Nauta, R. van Bree, and C. Seifert, \u201cNeural prototype trees for interpretable fine-grained image recognition,\u201d Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.14933-14943, 2021. 10.1109\/cvpr46437.2021.01469","DOI":"10.1109\/CVPR46437.2021.01469"},{"key":"41","doi-asserted-by":"crossref","unstructured":"[41] R. Ji, L. Wen, L. Zhang, D. Du, Y. Wu, C. Zhao, X. Liu, and F. Huang, \u201cAttention convolutional binary neural tree for fine-grained visual categorization,\u201d Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.10468-10477, 2020. 10.1109\/cvpr42600.2020.01048","DOI":"10.1109\/CVPR42600.2020.01048"},{"key":"42","doi-asserted-by":"crossref","unstructured":"[42] F. Xu, M. Wang, W. Zhang, Y. Cheng, and W. Chu, \u201cDiscrimination-aware mechanism for fine-grained representation learning,\u201d Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.813-822, 2021. 10.1109\/cvpr46437.2021.00087","DOI":"10.1109\/CVPR46437.2021.00087"},{"key":"43","doi-asserted-by":"crossref","unstructured":"[43] Y. Zhao, K. Yan, F. Huang, and J. Li, \u201cGraph-based high-order relation discovery for fine-grained recognition,\u201d Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.15079-15088, 2021. 10.1109\/cvpr46437.2021.01483","DOI":"10.1109\/CVPR46437.2021.01483"},{"key":"44","doi-asserted-by":"crossref","unstructured":"[44] J. Deng, W. Dong, R. Socher, L.J. Li, K. Li, and L. Fei-Fei, \u201cImagenet: A large-scale hierarchical image database,\u201d 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp.248-255, 2009. 10.1109\/cvpr.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"45","unstructured":"[45] S. Ioffe and C. Szegedy, \u201cBatch normalization: Accelerating deep network training by reducing internal covariate shift,\u201d International conference on machine learning, pp.448-456, PMLR, 2015."},{"key":"46","doi-asserted-by":"crossref","unstructured":"[46] Q. Xu and L. Zhang, \u201cThe effect of different hidden unit number of sparse autoencoder,\u201d The 27th Chinese Control and Decision Conference (2015 CCDC), pp.2464-2467, 2015. 10.1109\/ccdc.2015.7162335","DOI":"10.1109\/CCDC.2015.7162335"},{"key":"47","doi-asserted-by":"crossref","unstructured":"[47] M.D. Zeiler and R. Fergus, \u201cVisualizing and understanding convolutional networks,\u201d European conference on computer vision, vol.8689, pp.818-833, Springer, 2014. 10.1007\/978-3-319-10590-1_53","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"48","unstructured":"[48] A. Paszke, S. Gross, S. Chintala, G. Chanan, E. Yang, Z. DeVito, Z. Lin, A. Desmaison, L. Antiga, and A. Lerer, \u201cAutomatic differentiation in pytorch,\u201d 2017."},{"key":"49","doi-asserted-by":"crossref","unstructured":"[49] T. He, Z. Zhang, H. Zhang, Z. Zhang, J. Xie, and M. Li, \u201cBag of tricks for image classification with convolutional neural networks,\u201d Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.558-567, 2019. 10.1109\/cvpr.2019.00065","DOI":"10.1109\/CVPR.2019.00065"},{"key":"50","doi-asserted-by":"crossref","unstructured":"[50] C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, and A. Rabinovich, \u201cGoing deeper with convolutions,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.1-9, 2015. 10.1109\/cvpr.2015.7298594","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"51","unstructured":"[51] R.R. Selvaraju, A. Das, R. Vedantam, M. Cogswell, D. Parikh, and D. Batra, \u201cGrad-cam: Why did you say that?,\u201d arXiv preprint arXiv:1611.07450, 2016."},{"key":"52","doi-asserted-by":"publisher","unstructured":"[52] B. Zhao, J. Feng, X. Wu, and S. Yan, \u201cA survey on deep learning-based fine-grained object classification and semantic segmentation,\u201d International Journal of Automation and Computing, vol.14, no.2, pp.119-135, 2017. 10.1007\/s11633-017-1053-3","DOI":"10.1007\/s11633-017-1053-3"},{"key":"53","doi-asserted-by":"crossref","unstructured":"[53] A. Angelova and S. Zhu, \u201cEfficient object detection and segmentation for fine-grained recognition,\u201d Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp.811-818, 2013. 10.1109\/cvpr.2013.110","DOI":"10.1109\/CVPR.2013.110"},{"key":"54","unstructured":"[54] R. Wightman, \u201cPytorch image models.\u201d https:\/\/github.com\/rwightman\/pytorch-image-models, 2019."},{"key":"55","unstructured":"[55] P. Micikevicius, S. Narang, J. Alben, G. Diamos, E. Elsen, D. Garcia, B. Ginsburg, M. Houston, O. Kuchaiev, G. Venkatesh, et al., \u201cMixed precision training,\u201d arXiv preprint arXiv:1710.03740, 2017."},{"key":"56","unstructured":"[56] R. M\u00fcller, S. Kornblith, and G. Hinton, \u201cWhen does label smoothing help?,\u201d arXiv preprint arXiv:1906.02629, 2019."},{"key":"57","unstructured":"[57] I. Loshchilov and F. Hutter, \u201cSGDR: stochastic gradient descent with warm restarts,\u201d 5th International Conference on Learning Representations, 2017."}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E105.D\/3\/E105.D_2021EDP7166\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,3,5]],"date-time":"2022-03-05T04:00:16Z","timestamp":1646452816000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E105.D\/3\/E105.D_2021EDP7166\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,1]]},"references-count":57,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2022]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2021edp7166","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,3,1]]},"article-number":"2021EDP7166"}}