{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T19:05:15Z","timestamp":1773774315917,"version":"3.50.1"},"reference-count":63,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2020,1,28]],"date-time":"2020-01-28T00:00:00Z","timestamp":1580169600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,1,28]],"date-time":"2020-01-28T00:00:00Z","timestamp":1580169600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2020,4]]},"DOI":"10.1007\/s11263-019-01283-0","type":"journal-article","created":{"date-parts":[[2020,1,28]],"date-time":"2020-01-28T11:02:55Z","timestamp":1580209375000},"page":"783-798","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":143,"title":["A Simple and Light-Weight Attention Module for Convolutional Neural Networks"],"prefix":"10.1007","volume":"128","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9808-6823","authenticated-orcid":false,"given":"Jongchan","family":"Park","sequence":"first","affiliation":[]},{"given":"Sanghyun","family":"Woo","sequence":"additional","affiliation":[]},{"given":"Joon-Young","family":"Lee","sequence":"additional","affiliation":[]},{"given":"In So","family":"Kweon","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,1,28]]},"reference":[{"key":"1283_CR1","unstructured":"Ba, J., Mnih, V., & Kavukcuoglu, K. (2015). Multiple object recognition with visual attention. In Proceedings of international conference on learning representations (ICLR)."},{"key":"1283_CR2","unstructured":"Bahdanau, D., Cho, K., & Bengio, Y. (2014). Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473."},{"key":"1283_CR3","doi-asserted-by":"crossref","unstructured":"Bell, S., Lawrence\u00a0Zitnick, C., Bala, K., & Girshick, R. (2016). Inside-outside net: Detecting objects in context with skip pooling and recurrent neural networks. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.314"},{"key":"1283_CR4","doi-asserted-by":"crossref","unstructured":"Chen, L., Zhang, H., Xiao, J., Nie, L., Shao, J., & Chua, T. S. (2017a) Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2017.667"},{"key":"1283_CR5","doi-asserted-by":"crossref","unstructured":"Chen, L., Zhang, H., Xiao, J., Nie, L., Shao, J., Liu, W., & Chua, T. S. (2017b) Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5659\u20135667).","DOI":"10.1109\/CVPR.2017.667"},{"key":"1283_CR6","unstructured":"Chen, L. C., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A. L. (2016). Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. arXiv preprint arXiv:1606.00915."},{"key":"1283_CR7","doi-asserted-by":"crossref","unstructured":"Chollet, F. (2017). Xception: Deep learning with depthwise separable convolutions. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2017.195"},{"key":"1283_CR8","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1038\/nrn755","volume":"3","author":"M Corbetta","year":"2002","unstructured":"Corbetta, M., & Shulman, G. L. (2002). Control of goal-directed and stimulus-driven attention in the brain. Nature Reviews Neuroscience, 3, 3.","journal-title":"Nature Reviews Neuroscience"},{"key":"1283_CR9","unstructured":"Dai, J., Qi, H., Xiong, Y., Li, Y., Zhang, G., Hu, H., Wei, Y. (2017). Deformable convolutional networks. CoRR, abs\/170306211 1(2):3."},{"key":"1283_CR10","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L. J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1283_CR11","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014). Generative adversarial nets. In Advances in neural information processing systems (pp. 2672\u20132680)."},{"key":"1283_CR12","unstructured":"Gregor, K., Danihelka, I., Graves, A., Rezende, D. J., & Wierstra, D. (2015). Draw: A recurrent neural network for image generation. In Proceedings of international conference on machine learning (ICML)."},{"key":"1283_CR13","doi-asserted-by":"crossref","unstructured":"Han, D., Kim, J., & Kim, J. (2017). Deep pyramidal residual networks. In 2017 IEEE conference on computer vision and pattern recognition (CVPR) (pp. 6307\u20136315). IEEE.","DOI":"10.1109\/CVPR.2017.668"},{"key":"1283_CR14","doi-asserted-by":"crossref","unstructured":"Hariharan, B., Arbel\u00e1ez, P., Girshick, R., & Malik, J. (2015). Hypercolumns for object segmentation and fine-grained localization. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298642"},{"key":"1283_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J. (2016a). Deep residual learning for image recognition. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.90"},{"key":"1283_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016b). Identity mappings in deep residual networks. In Proceedings of European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-46493-0_38"},{"issue":"9","key":"1283_CR17","doi-asserted-by":"publisher","first-page":"1095","DOI":"10.1016\/0042-6989(89)90058-8","volume":"29","author":"J Hirsch","year":"1989","unstructured":"Hirsch, J., & Curcio, C. A. (1989). The spatial resolution capacity of human foveal retina. Vision Research, 29(9), 1095\u20131101.","journal-title":"Vision Research"},{"key":"1283_CR18","unstructured":"Howard, A. G., Zhu, M., Chen, B., Kalenichenko, D., Wang, W., Weyand, T., Andreetto, M., & Adam, H. (2017). Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861."},{"key":"1283_CR19","unstructured":"Hu, J., Shen, L., Albanie, S., Sun, G., & Vedaldi, A. (2018a). Gather-excite: Exploiting feature context in convolutional neural networks. In Advances in neural information processing systems (pp. 9422\u20139432)."},{"key":"1283_CR20","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., & Sun, G. (2018b). Squeeze-and-excitation networks. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2018.00745"},{"key":"1283_CR21","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Weinberger, K. Q., & van\u00a0der Maaten, L. (2017). Densely connected convolutional networks. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2017.243"},{"key":"1283_CR22","doi-asserted-by":"crossref","unstructured":"Huang, G., Sun, Y., Liu, Z., Sedra, D., & Weinberger, K. Q. (2016). Deep networks with stochastic depth. In Proceedings of European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-46493-0_39"},{"issue":"3","key":"1283_CR23","doi-asserted-by":"publisher","first-page":"574","DOI":"10.1113\/jphysiol.1959.sp006308","volume":"148","author":"DH Hubel","year":"1959","unstructured":"Hubel, D. H., & Wiesel, T. N. (1959). Receptive fields of single neurones in the cat\u2019s striate cortex. The Journal of Physiology, 148(3), 574\u2013591.","journal-title":"The Journal of Physiology"},{"key":"1283_CR24","unstructured":"Iandola, F. N., Han, S., Moskewicz, M. W., Ashraf, K., Dally, W. J., & Keutzer, K. (2016). Squeezenet: Alexnet-level accuracy with 50x fewer parameters and<0.5mb model size. arXiv preprint arXiv:1602.07360."},{"key":"1283_CR25","unstructured":"Ioffe, S., & Szegedy, C. (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift. In Proceedings of international conference on machine learning (ICML)."},{"key":"1283_CR26","doi-asserted-by":"crossref","unstructured":"Itti, L., Koch, C., & Niebur, E. (1998). A model of saliency-based visual attention for rapid scene analysis. In IEEE transactions on pattern analysis machine intelligence (TPAMI).","DOI":"10.1109\/34.730558"},{"key":"1283_CR27","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A., et\u00a0al. (2015a). Spatial transformer networks. In Proceedings of neural information processing systems (NIPS)."},{"key":"1283_CR28","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A., et\u00a0al. (2015b). Spatial transformer networks. In Advances in neural information processing systems (pp. 2017\u20132025)."},{"key":"1283_CR29","unstructured":"Jia, X., De\u00a0Brabandere, B., Tuytelaars, T., & Gool, L. V. (2016). Dynamic filter networks. In Advances in neural information processing systems (pp. 667\u2013675)."},{"key":"1283_CR30","unstructured":"Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980."},{"key":"1283_CR31","unstructured":"Krizhevsky, A., & Hinton, G. (2009). Learning multiple layers of features from tiny images. Technical report, University of Toronto."},{"key":"1283_CR32","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. In Proceedings of neural information processing systems (NIPS)."},{"key":"1283_CR33","unstructured":"Larochelle, H., & Hinton, G. E. (2010). Learning to combine foveal glimpses with a third-order Boltzmann machine. In Proceedings of neural information processing systems (NIPS)."},{"key":"1283_CR34","doi-asserted-by":"crossref","unstructured":"Ledig, C., Theis, L., Husz\u00e1r, F., Caballero, J., Cunningham, A., Acosta, A., Aitken, A. P., Tejani, A., Totz, J., Wang, Z., et\u00a0al. (2017). Photo-realistic single image super-resolution using a generative adversarial network. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2017.19"},{"key":"1283_CR35","doi-asserted-by":"crossref","unstructured":"Li, W., Zhu, X., & Gong, S. (2018). Harmonious attention network for person re-identification. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2285\u20132294).","DOI":"10.1109\/CVPR.2018.00243"},{"key":"1283_CR36","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. In Proceedings of European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1283_CR37","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016). Ssd: Single shot multibox detector. In Proceedings of European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"1283_CR38","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., & Darrell, T. (2015). Fully convolutional networks for semantic segmentation. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"1283_CR39","unstructured":"Marr, D., & Vision, A. (1982). A computational investigation into the human representation and processing of visual information (Vol. 1, No. 2). WH San Francisco: Freeman and Company."},{"key":"1283_CR40","unstructured":"Mnih, V., Heess, N., Graves, A., et\u00a0al. (2014). Recurrent models of visual attention. Advances in neural information processing systems. In Proceedings of neural information processing systems (NIPS)."},{"key":"1283_CR41","unstructured":"Morcos, A. S., Barrett, D. G., Rabinowitz, N. C., & Botvinick, M. (2018) On the importance of single directions for generalization. In Proceedings of international conference on learning representations (ICLR)."},{"key":"1283_CR42","doi-asserted-by":"crossref","unstructured":"Nam, H., Ha, J. W., & Kim, J. (2017). Dual attention networks for multimodal reasoning and matching. In Proceedings of computer vision and pattern recognition (CVPR) (pp. 2156\u20132164).","DOI":"10.1109\/CVPR.2017.232"},{"key":"1283_CR43","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. In Proceedings of neural information processing systems (NIPS)."},{"issue":"1\u20133","key":"1283_CR44","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1080\/135062800394667","volume":"7","author":"RA Rensink","year":"2000","unstructured":"Rensink, R. A. (2000). The dynamic representation of scenes. Visual Cognition, 7(1\u20133), 17\u201342.","journal-title":"Visual Cognition"},{"issue":"11","key":"1283_CR45","doi-asserted-by":"publisher","first-page":"1019","DOI":"10.1038\/14819","volume":"2","author":"M Riesenhuber","year":"1999","unstructured":"Riesenhuber, M., & Poggio, T. (1999). Hierarchical models of object recognition in cortex. Nature Neuroscience, 2(11), 1019\u20131025.","journal-title":"Nature Neuroscience"},{"key":"1283_CR46","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., & Chen, L. C. (2018). Mobilenetv2: Inverted residuals and linear bottlenecks. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4510\u20134520).","DOI":"10.1109\/CVPR.2018.00474"},{"key":"1283_CR47","doi-asserted-by":"crossref","unstructured":"Simon, M., & Rodner, E. (2015). Neural activation constellations: Unsupervised part model discovery with convolutional networks. In Proceedings of the IEEE international conference on computer vision (pp. 1143\u20131151).","DOI":"10.1109\/ICCV.2015.136"},{"key":"1283_CR48","unstructured":"Simonyan, K., & Zisserman, A. (2015). Very deep convolutional networks for large-scale image recognition. In Proceedings of international conference on learning representations (ICLR)."},{"key":"1283_CR49","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., & Rabinovich, A. (2015). Going deeper with convolutions. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"1283_CR50","doi-asserted-by":"crossref","unstructured":"Wang, F., Jiang, M., Qian, C., Yang, S., Li, C., Zhang, H., Wang, X., & Tang, X. (2017). Residual attention network for image classification. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2017.683"},{"key":"1283_CR51","doi-asserted-by":"crossref","unstructured":"Woo, S., Hwang, S., & Kweon, I. S. (2018a) Stairnet: Top-down semantic aggregation for accurate one shot detection. In Proceedings of winter conference on applications of computer vision (WACV).","DOI":"10.1109\/WACV.2018.00125"},{"key":"1283_CR52","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, J., Lee, J. Y., & Kweon, I. S. (2018b) Cbam: Convolutional block attention module. In Proceedings of European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"1283_CR53","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., & Sun, J. (2018). Unified perceptual parsing for scene understanding. In Proceedings of European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"1283_CR54","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., He, K. (2017). Aggregated residual transformations for deep neural networks. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2017.634"},{"key":"1283_CR55","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., & Bengio, Y. (2015). Show, attend and tell: Neural image caption generation with visual attention. In Proceedings of international conference on machine learning (ICML)."},{"key":"1283_CR56","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., & Smola, A. (2016). Stacked attention networks for image question answering. In Proceedings of computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.10"},{"key":"1283_CR57","unstructured":"Yu, F, & Koltun, V. (2016). Multi-scale context aggregation by dilated convolutions. In Proceedings of international conference on learning representations (ICLR)."},{"key":"1283_CR58","doi-asserted-by":"crossref","unstructured":"Zagoruyko, S, & Komodakis, N. (2016). Wide residual networks. In Proceedings of British machine vision conference (BMVC).","DOI":"10.5244\/C.30.87"},{"key":"1283_CR59","unstructured":"Zeiler, M. D. (2012) Adadelta: An adaptive learning rate method. arXiv preprint arXiv:1212.5701."},{"key":"1283_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, X., Xiong, H., Zhou, W., Lin, W., & Tian, Q. (2016). Picking deep filter responses for fine-grained image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1134\u20131142).","DOI":"10.1109\/CVPR.2016.128"},{"key":"1283_CR61","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., & Torralba, A. (2018). Semantic segmentation on MIT ADE20K dataset in PyTorch. https:\/\/github.com\/CSAILVision\/semantic-segmentation-pytorch\/."},{"key":"1283_CR62","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., & Torralba, A. (2019). Semantic understanding of scenes through the ade20k dataset. International Journal of Computer Vision (IJCV), 127, 302\u2013321. https:\/\/doi.org\/10.1007\/s11263-018-1140-0.","journal-title":"International Journal of Computer Vision (IJCV)"},{"key":"1283_CR63","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Zhao, C., Wang, J., Zhao, X., Wu, Y., & Lu, H. (2017). Couplenet: Coupling global structure with local parts for object detection. In Proceedings of international conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2017.444"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01283-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-019-01283-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01283-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,1,27]],"date-time":"2021-01-27T00:34:26Z","timestamp":1611707666000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-019-01283-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,1,28]]},"references-count":63,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2020,4]]}},"alternative-id":["1283"],"URL":"https:\/\/doi.org\/10.1007\/s11263-019-01283-0","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,1,28]]},"assertion":[{"value":"20 January 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 December 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 January 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}