{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,19]],"date-time":"2025-04-19T04:52:52Z","timestamp":1745038372536,"version":"3.40.3"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031189159"},{"type":"electronic","value":"9783031189166"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-18916-6_45","type":"book-chapter","created":{"date-parts":[[2022,10,26]],"date-time":"2022-10-26T23:03:53Z","timestamp":1666825433000},"page":"558-573","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Cross-Stage Class-Specific Attention for\u00a0Image Semantic Segmentation"],"prefix":"10.1007","author":[{"given":"Zhengyi","family":"Shi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingli","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,27]]},"reference":[{"key":"45_CR1","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., Jia, J.: Pyramid scene parsing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.660"},{"key":"45_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"833","DOI":"10.1007\/978-3-030-01234-2_49","volume-title":"Computer Vision \u2013 ECCV 2018","author":"L-C Chen","year":"2018","unstructured":"Chen, L.-C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11211, pp. 833\u2013851. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01234-2_49"},{"key":"45_CR3","doi-asserted-by":"crossref","unstructured":"Yang, M., Yu, K., Zhang, C., Li, Z., Yang, K.: Denseaspp for semantic segmentation in street scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3684\u20133692 (2018)","DOI":"10.1109\/CVPR.2018.00388"},{"key":"45_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"432","DOI":"10.1007\/978-3-030-01228-1_26","volume-title":"Computer Vision \u2013 ECCV 2018","author":"T Xiao","year":"2018","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., Sun, J.: Unified perceptual parsing for scene understanding. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11209, pp. 432\u2013448. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01228-1_26"},{"key":"45_CR5","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., Dollar, P.: Panoptic feature pyramid networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00656"},{"key":"45_CR6","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"45_CR7","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Chen, H., He, T.: FCOS: fully convolutional one-stage object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00972"},{"key":"45_CR8","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Semantic image segmentation with deep convolutional nets and fully connected CRFs. arXiv preprint arXiv:1412.7062 (2014)"},{"key":"45_CR9","doi-asserted-by":"crossref","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: DeepLab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017)","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"45_CR10","unstructured":"Chen, L.-C., Papandreou, G., Schroff, F., Adam, H.: Rethinking atrous convolution for semantic image segmentation. arXiv preprint arXiv:1706.05587 (2017)"},{"key":"45_CR11","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"45_CR12","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"45_CR13","unstructured":"Huang, Z., Wang, X., Huang, L., Huang, C., Wei, Y., Liu, W.: CCNet: criss-cross attention for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 603\u2013612 (2014)"},{"key":"45_CR14","doi-asserted-by":"crossref","unstructured":"Fu, J., Liu, J., Tian, H., Li, Y., Bao, Y., Fang, Z., Lu, H.: Dual attention network for scene segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3146\u20133154 (2019)","DOI":"10.1109\/CVPR.2019.00326"},{"key":"45_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1007\/978-3-030-58539-6_11","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Yuan","year":"2020","unstructured":"Yuan, Y., Chen, X., Wang, J.: Object-contextual representations for semantic segmentation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 173\u2013190. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_11"},{"key":"45_CR16","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"45_CR17","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"45_CR18","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. arXiv preprint arXiv:1910.10683 (2019)"},{"key":"45_CR19","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"45_CR20","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357. PMLR (2021)"},{"key":"45_CR21","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Tokens-to-token VIT: training vision transformers from scratch on imagenet. arXiv preprint arXiv:2101.11986 (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"45_CR22","doi-asserted-by":"crossref","unstructured":"He, S., Luo, H., Wang, P., Wang, F., Li, H., Jiang, W.: TransReID: transformer-based object re-identification. arXiv preprint arXiv:2102.04378 (2021)","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"45_CR23","unstructured":"Zhu, K., et al.: AAformer: auto-aligned transformer for person re-identification. arXiv preprint arXiv:2104.00921s (2021)"},{"key":"45_CR24","unstructured":"Lee, K., Chang, H., Jiang, L., Zhang, H., Tu, Z., Liu, C.: VitGAN: training GANs with vision transformers. arXiv preprint arXiv:2107.04589 (2021)"},{"key":"45_CR25","unstructured":"Jiang, Y., Chang, S., Wang, Z.: TransGAN: two transformers can make one strong GAN. arXiv preprint arXiv:2102.07074 (2021)"},{"key":"45_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"45_CR27","unstructured":"Chu, X., et al.: Twins: revisiting the design of spatial attention in vision transformers. arXiv preprint arXiv:2104.13840 (2021)"},{"key":"45_CR28","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. arXiv preprint arXiv:2102.12122 (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"45_CR29","doi-asserted-by":"crossref","unstructured":"Wang, Y., Zhang, X., Yang, T., Sun, J.: Anchor DETR: query design for transformer-based detector. arXiv preprint arXiv:2109.07107 (2021)","DOI":"10.1609\/aaai.v36i3.20158"},{"key":"45_CR30","doi-asserted-by":"crossref","unstructured":"Zheng, S., et al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6881\u20136890 (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"45_CR31","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"45_CR32","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: simple and efficient design for semantic segmentation with transformers. In: Advances in Neural Information Processing Systems (2021)"},{"key":"45_CR33","doi-asserted-by":"crossref","unstructured":"Kuo, W., Angelova, A., Malik, J., Lin, T.-Y.: ShapeMask: learning to segment novel objects by refining shape priors. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) pp. 9207\u20139216 (2019)","DOI":"10.1109\/ICCV.2019.00930"},{"key":"45_CR34","doi-asserted-by":"crossref","unstructured":"Li, K., Hariharan, B., Malik, J.: Iterative instance segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3659\u20133667 (2016)","DOI":"10.1109\/CVPR.2016.398"},{"key":"45_CR35","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Wu, Y., He, K., Girshick, R.: Pointrend: image segmentation as rendering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9799\u20139808 (2020)","DOI":"10.1109\/CVPR42600.2020.00982"},{"key":"45_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, F., et al.: ACFNet: attentional class feature network for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6798\u20136807 (2019)","DOI":"10.1109\/ICCV.2019.00690"},{"key":"45_CR37","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"45_CR38","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ade20k dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 633\u2013641 (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"45_CR39","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., et al.: The role of context for object detection and semantic segmentation in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 891\u2013898 (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"45_CR40","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: Coco-stuff: thing and stuff classes in contex. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1209\u20131218 (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"45_CR41","unstructured":"MMSegmentation Contributors (2020). https:\/\/github.com\/open-mmlab\/mmsegmentation"},{"key":"45_CR42","doi-asserted-by":"crossref","unstructured":"Long, Shelhamer, J., Darrell, E., T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"45_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"191","DOI":"10.1007\/978-3-030-58555-6_12","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Yin","year":"2020","unstructured":"Yin, M., et al.: Disentangled non-local neural networks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 191\u2013207. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_12"},{"key":"45_CR44","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"45_CR45","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., Wang, J.: Deep high-resolution representation learning for human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5693\u20135703 (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"45_CR46","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 234\u2013241 (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"issue":"12","key":"45_CR47","doi-asserted-by":"publisher","first-page":"2481","DOI":"10.1109\/TPAMI.2016.2644615","volume":"39","author":"V Badrinarayanan","year":"2017","unstructured":"Badrinarayanan, V., Kendall, A., Cipolla, R.: SegNet: a deep convolutional encoder-decoder architecture for image segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 39(12), 2481\u20132495 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"45_CR48","unstructured":"Yuan, Y., Huang, L., Guo, J., Zhang, C., Chen, X., Wang, J.: OCnet: object context network for scene parsing. arXiv preprint arXiv:1809.00916 (2018)"},{"key":"45_CR49","unstructured":"Loshchilov, I., Hutter, F.: Fixing weight decay regularization in Adam (2018)"},{"key":"45_CR50","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"45_CR51","unstructured":"Cheng, B., Schwing, A.G., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. arXiv preprint arXiv:2107.06278 (2021)"},{"key":"45_CR52","doi-asserted-by":"crossref","unstructured":"Wu, Y.-H., et al.: P2T: pyramid pooling transformer for scene understanding. arXiv preprint arXiv:2106.12011 (2021)","DOI":"10.1109\/TPAMI.2022.3202765"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-18916-6_45","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,26]],"date-time":"2022-10-26T23:48:03Z","timestamp":1666828083000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-18916-6_45"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031189159","9783031189166"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-18916-6_45","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"27 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shenzhen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/en.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"microsoft","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"564","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"233","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"41% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.03","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.35","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}