{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T00:55:03Z","timestamp":1769043303569,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":46,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556922","type":"print"},{"value":"9789819556939","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5693-9_34","type":"book-chapter","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T21:23:19Z","timestamp":1768944199000},"page":"489-503","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["CATFormer: Context Aggregation and\u00a0Transmission Transformer"],"prefix":"10.1007","author":[{"given":"Hongbing","family":"Duan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingbei","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,21]]},"reference":[{"key":"34_CR1","unstructured":"Chen, K., et al.: MMDetection: open mmlab detection toolbox and benchmark. arXiv preprint arXiv:1906.07155 (2019)"},{"key":"34_CR2","doi-asserted-by":"publisher","unstructured":"Chen, Q., et al.: Mixformer: mixing features across windows and dimensions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (CVPR), pp. 5239\u20135249 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.00518","DOI":"10.1109\/CVPR52688.2022.00518"},{"key":"34_CR3","first-page":"9355","volume":"34","author":"X Chu","year":"2021","unstructured":"Chu, X., et al.: Twins: revisiting the design of spatial attention in vision transformers. Adv. Neural. Inf. Process. Syst. 34, 9355\u20139366 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"34_CR4","unstructured":"Contributors, M.: MMSegmentation: Openmmlab semantic segmentation toolbox and benchmark. https:\/\/github.com\/open-mmlab\/mmsegmentation (2020)"},{"key":"34_CR5","doi-asserted-by":"crossref","unstructured":"Ding, M., Xiao, B., Codella, N., Luo, P., Wang, J., Yuan, L.: Davit: dual attention vision transformers. In: Proceedings of the IEEE\/CVF Conference on European Conference on Computer Vision (ECCV), pp. 74\u201392. Springer (2022)","DOI":"10.1007\/978-3-031-20053-3_5"},{"key":"34_CR6","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR), pp. 13713\u201313722 (2020)"},{"key":"34_CR7","unstructured":"Goyal, P., et al.: Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)"},{"key":"34_CR8","doi-asserted-by":"crossref","unstructured":"Grainger, R., Paniagua, T., Song, X., Cuntoor, N., Lee, M.W., Wu, T.: Paca-vit: learning patch-to-cluster attention in vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 18568\u201318578, June 2023","DOI":"10.1109\/CVPR52729.2023.01781"},{"key":"34_CR9","doi-asserted-by":"crossref","unstructured":"Guo, J., et al.: Cmt: convolutional neural networks meet vision transformers. 2022 ieee. In: CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12165\u201312175 (2022)","DOI":"10.1109\/CVPR52688.2022.01186"},{"key":"34_CR10","doi-asserted-by":"crossref","unstructured":"Hassani, A., Walton, S., Li, J., Li, S., Shi, H.: Neighborhood attention transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6185\u20136194, June 2023","DOI":"10.1109\/CVPR52729.2023.00599"},{"key":"34_CR11","unstructured":"Hatamizadeh, A., e al.: Fastervit: fast vision transformers with hierarchical attention. arXiv preprint arXiv:2306.06189 (2023)"},{"key":"34_CR12","doi-asserted-by":"crossref","unstructured":"Hatamizadeh, A., Kautz, J.: Mambavision: a hybrid mamba-transformer vision backbone. arXiv preprint arXiv:2407.08083 (2024)","DOI":"10.1109\/CVPR52734.2025.02352"},{"key":"34_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"34_CR14","doi-asserted-by":"crossref","unstructured":"Huang, G., Sun, Y., Liu, Z., Sedra, D., Weinberger, K.Q.: Deep networks with stochastic depth. In: Proceedings of the IEEE\/CVF Conference on European Conference on Computer Vision (ECCV), pp. 646\u2013661. Springer (2016)","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"34_CR15","unstructured":"Huang, H., Zhou, X., Cao, J., He, R., Tan, T.: Vision transformer with super token sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22690\u201322699, June 2023"},{"key":"34_CR16","doi-asserted-by":"crossref","unstructured":"Kim, M., Seo, P.H., Schmid, C., Cho, M.: Learning correlation structures for vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18941\u201318951 (2024)","DOI":"10.1109\/CVPR52733.2024.01792"},{"key":"34_CR17","doi-asserted-by":"publisher","unstructured":"Kirillov, A., Girshick, R., He, K., Doll\u00e1r, P.: Panoptic feature pyramid networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6392\u20136401 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00656","DOI":"10.1109\/CVPR.2019.00656"},{"key":"34_CR18","doi-asserted-by":"publisher","unstructured":"Li, K., Wang, Y., Zhang, J., Gao, P., Song, G., Liu, Y., Li, H., Qiao, Y.: Uniformer: unifying convolution and self-attention for visual recognition. IEEE Trans. Pattern Anal. Mach. Intell. 1\u201318 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2023.3282631","DOI":"10.1109\/TPAMI.2023.3282631"},{"key":"34_CR19","first-page":"12934","volume":"35","author":"Y Li","year":"2022","unstructured":"Li, Y., et al.: Efficientformer: vision transformers at mobilenet speed. Adv. Neural. Inf. Process. Syst. 35, 12934\u201312949 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"34_CR20","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft coco: common objects in context. In: European Conference on Computer Vision (ECCV), pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"34_CR21","doi-asserted-by":"publisher","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9992\u201310002 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00986","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"34_CR22","doi-asserted-by":"crossref","unstructured":"Pan, J., et al.: Edgevits: competing light-weight cnns on mobile devices with vision transformers. In: European Conference on Computer Vision, pp. 294\u2013311. Springer (2022)","DOI":"10.1007\/978-3-031-20083-0_18"},{"key":"34_CR23","doi-asserted-by":"crossref","unstructured":"Ren, S., Yang, X., Liu, S., Wang, X.: Sg-former: self-guided transformer with evolving token reallocation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6003\u20136014 (2023)","DOI":"10.1109\/ICCV51070.2023.00552"},{"key":"34_CR24","doi-asserted-by":"crossref","unstructured":"Ren, S., Zhou, D., He, S., Feng, J., Wang, X.: Shunted self-attention via multi-scale token aggregation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10853\u201310862 (2022)","DOI":"10.1109\/CVPR52688.2022.01058"},{"key":"34_CR25","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","DOI":"10.1007\/s11263-015-0816-y"},{"key":"34_CR26","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"34_CR27","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., Jegou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357 (2021)"},{"key":"34_CR28","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS), pp. 6000\u2014-6010. Curran Associates Inc. (2017)"},{"key":"34_CR29","doi-asserted-by":"crossref","unstructured":"Wang, C., Xu, H., Zhang, X., Wang, L., Zheng, Z., Liu, H.: Convolutional embedding makes hierarchical vision transformer stronger. In: Proceedings of the IEEE\/CVF Conference on European Conference on Computer Vision (ECCV), pp. 739\u2013756. Springer (2022)","DOI":"10.1007\/978-3-031-20044-1_42"},{"key":"34_CR30","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Internimage: exploring large-scale vision foundation models with deformable convolutions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14408\u201314419, June 2023","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"34_CR31","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 568\u2013578 (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"34_CR32","unstructured":"Wang, W., et al.: Crossformer: a versatile vision transformer hinging on cross-scale attention. In: International Conference on Learning Representations (ICLR) (2022)"},{"key":"34_CR33","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: vheat: building vision models upon heat conduction (2024)","DOI":"10.1109\/CVPR52734.2025.00907"},{"key":"34_CR34","doi-asserted-by":"crossref","unstructured":"Xia, Z., Pan, X., Song, S., Li, L.E., Huang, G.: Vision transformer with deformable attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4794\u20134803 (2022)","DOI":"10.1109\/CVPR52688.2022.00475"},{"key":"34_CR35","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., Sun, J.: Unified perceptual parsing for scene understanding. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 418\u2013434 (2018)","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"34_CR36","unstructured":"Yang, C., et al.: Moat: alternating mobile convolution and attention brings strong vision models. In: International Conference on Learning Representations (ICLR), May 2023"},{"key":"34_CR37","unstructured":"Yang, J., et al.: Focal self-attention for local-global interactions in vision transformers. arXiv preprint arXiv:2107.00641 (2021)"},{"key":"34_CR38","doi-asserted-by":"crossref","unstructured":"Yang, R., et al.: Scalablevit: rethinking the context-oriented generalization of vision transformer. arXiv preprint arXiv:2203.10790 (2022)","DOI":"10.1007\/978-3-031-20053-3_28"},{"issue":"5","key":"34_CR39","doi-asserted-by":"publisher","first-page":"3795","DOI":"10.1109\/TCSVT.2023.3321190","volume":"34","author":"J Yuan","year":"2024","unstructured":"Yuan, J., Zhu, A., Xu, Q., Wattanachote, K., Gong, Y.: Ctif-net: a cnn-transformer iterative fusion network for salient object detection. IEEE Trans. Circuits Syst. Video Technol. 34(5), 3795\u20133805 (2024). https:\/\/doi.org\/10.1109\/TCSVT.2023.3321190","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"34_CR40","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S.J., Chun, S., Choe, J., Yoo, Y.: Cutmix: regularization strategy to train strong classifiers with localizable features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6022\u20136031 (2019)","DOI":"10.1109\/ICCV.2019.00612"},{"key":"34_CR41","unstructured":"Zhang, H., Cisse, M., Dauphin, Y.N., Lopez-Paz, D.: mixup: beyond empirical risk minimization. In: International Conference on Learning Representations (ICLR) (2018)"},{"key":"34_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, S., Liu, H., Lin, S., He, K.: You only need less attention at each stage in vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6057\u20136066, June 2024","DOI":"10.1109\/CVPR52733.2024.00579"},{"key":"34_CR43","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Zheng, L., Kang, G., Li, S., Yang, Y.: Random erasing data augmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 13001\u201313008, August 2020","DOI":"10.1609\/aaai.v34i07.7000"},{"issue":"3","key":"34_CR44","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ade20k dataset. Int. J. Comput. Vision 127(3), 302\u2013321 (2019)","journal-title":"Int. J. Comput. Vision"},{"key":"34_CR45","doi-asserted-by":"publisher","unstructured":"Zhu, L., Wang, X., Ke, Z., Zhang, W., Lau, R.: Biformer: vision transformer with bi-level routing attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10323\u201310333 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00995","DOI":"10.1109\/CVPR52729.2023.00995"},{"key":"34_CR46","unstructured":"Zhu, L., Liao, B., Zhang, Q., Wang, X., Liu, W., Wang, X.: Vision mamba: efficient visual representation learning with bidirectional state space model. In: Salakhutdinov, R., et al. (eds.) Proceedings of the 41st International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0235, pp. 62429\u201362442. PMLR, 21\u201327 July 2024"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5693-9_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T21:23:24Z","timestamp":1768944204000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5693-9_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556922","9789819556939"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5693-9_34","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"21 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}