{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T16:27:07Z","timestamp":1778257627268,"version":"3.51.4"},"publisher-location":"Cham","reference-count":113,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198175","type":"print"},{"value":"9783031198182","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19818-2_17","type":"book-chapter","created":{"date-parts":[[2022,10,21]],"date-time":"2022-10-21T16:21:10Z","timestamp":1666369270000},"page":"288-307","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":90,"title":["k-means Mask Transformer"],"prefix":"10.1007","author":[{"given":"Qihang","family":"Yu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huiyu","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siyuan","family":"Qiao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Maxwell","family":"Collins","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yukun","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hartwig","family":"Adam","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alan","family":"Yuille","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liang-Chieh","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,22]]},"reference":[{"key":"17_CR1","doi-asserted-by":"crossref","unstructured":"Achanta, R., Shaji, A., Smith, K., Lucchi, A., Fua, P., S\u00fcsstrunk, S.: Slic superpixels compared to state-of-the-art superpixel methods. In: IEEE TPAMI (2012)","DOI":"10.1109\/TPAMI.2012.120"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Ainslie, J., Ontanon, S., Alberti, C., Pham, P., Ravula, A., Sanghai, S.: Etc: Encoding long and structured data in transformers. In: EMNLP (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.19"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: Vivit: A video vision transformer. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"17_CR4","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: ICLR (2015)"},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Bai, M., Urtasun, R.: Deep watershed transform for instance segmentation. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.305"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Ballard, D.H.: Generalizing the hough transform to detect arbitrary shapes. In: Pattern Recognition (1981)","DOI":"10.1016\/0031-3203(81)90009-1"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Bello, I., Zoph, B., Vaswani, A., Shlens, J., Le, Q.V.: Attention augmented convolutional networks. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00338"},{"key":"17_CR8","unstructured":"Beltagy, I., Peters, M.E., Cohan, A.: Longformer: The long-document transformer. arXiv:2004.05150 (2020)"},{"key":"17_CR9","unstructured":"Buades, A., Coll, B., Morel, J.M.: A non-local algorithm for image denoising. In: CVPR (2005)"},{"key":"17_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"17_CR11","unstructured":"Chen, J., et al.: Transunet: Transformers make strong encoders for medical image segmentation. arXiv:2102.04306 (2021)"},{"key":"17_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"695","DOI":"10.1007\/978-3-030-58545-7_40","volume-title":"Computer Vision \u2013 ECCV 2020","author":"L-C Chen","year":"2020","unstructured":"Chen, L.-C., et al.: Naive-student: leveraging semi-supervised learning in video sequences for urban scene segmentation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12354, pp. 695\u2013714. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58545-7_40"},{"key":"17_CR13","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Semantic image segmentation with deep convolutional nets and fully connected crfs. In: ICLR (2015)"},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. In: IEEE TPAMI (2017)","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"17_CR15","unstructured":"Chen, L.C., Papandreou, G., Schroff, F., Adam, H.: Rethinking atrous convolution for semantic image segmentation. arXiv:1706.05587 (2017)"},{"key":"17_CR16","unstructured":"Chen, L.C., Wang, H., Qiao, S.: Scaling wide residual networks for panoptic segmentation. arXiv:2011.11675 (2020)"},{"key":"17_CR17","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Yang, Y., Wang, J., Xu, W., Yuille, A.L.: Attention to scale: Scale-aware semantic image segmentation. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.396"},{"key":"17_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"833","DOI":"10.1007\/978-3-030-01234-2_49","volume-title":"Computer Vision \u2013 ECCV 2018","author":"L-C Chen","year":"2018","unstructured":"Chen, L.-C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11211, pp. 833\u2013851. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01234-2_49"},{"key":"17_CR19","unstructured":"Chen, Y., Kalantidis, Y., Li, J., Yan, S., Feng, J.: A$$\\hat{}$$ 2-nets: Double attention networks. In: NeurIPS (2018)"},{"key":"17_CR20","unstructured":"Cheng, B., Choudhuri, A., Misra, I., Kirillov, A., Girdhar, R., Schwing, A.G.: Mask2former for video instance segmentation. arXiv:2112.10764 (2021)"},{"key":"17_CR21","unstructured":"Cheng, B., et al.: Panoptic-DeepLab. In: ICCV COCO + Mapillary Joint Recognition Challenge Workshop (2019)"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Cheng, B., et al.: Panoptic-DeepLab: A Simple, Strong, and Fast Baseline for Bottom-Up Panoptic Segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01249"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"17_CR24","unstructured":"Cheng, B., Schwing, A.G., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. In: NeurIPS (2021)"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Cheng, J., Dong, L., Lapata, M.: Long short-term memory-networks for machine reading. In: EMNLP (2016)","DOI":"10.18653\/v1\/D16-1053"},{"key":"17_CR26","unstructured":"Child, R., Gray, S., Radford, A., Sutskever, I.: Generating long sequences with sparse transformers. arXiv:1904.10509 (2019)"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Chollet, F.: Xception: Deep learning with depthwise separable convolutions. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.195"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Cubuk, E.D., Zoph, B., Mane, D., Vasudevan, V., Le, Q.V.: Autoaugment: Learning augmentation policies from data. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00020"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Dai, Z., Yang, Z., Yang, Y., Carbonell, J.G., Le, Q., Salakhutdinov, R.: Transformer-xl: Attentive language models beyond a fixed-length context. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1285"},{"key":"17_CR31","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"17_CR32","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. In: ICLR (2021)"},{"key":"17_CR33","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: Multiscale vision transformers. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Fang, H.S., Sun, J., Wang, R., Gou, M., Li, Y.L., Lu, C.: Instaboost: Boosting instance segmentation via probability map guided copy-pasting. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00077"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Fu, J., et al.: Dual attention network for scene segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00326"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Gao, N., et al.: Ssap: Single-shot instance segmentation with affinity pyramid. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00073"},{"key":"17_CR37","doi-asserted-by":"crossref","unstructured":"Gao, P., Zheng, M., Wang, X., Dai, J., Li, H.: Fast convergence of detr with spatially modulated co-attention. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00360"},{"key":"17_CR38","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., et al.: Simple copy-paste is a strong data augmentation method for instance segmentation. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"17_CR39","unstructured":"Gupta, A., Berant, J.: Gmat: Global memory augmentation for transformers. arXiv:2006.03274 (2020)"},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask r-cnn. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"17_CR41","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"17_CR42","unstructured":"Ho, J., Kalchbrenner, N., Weissenborn, D., Salimans, T.: Axial attention in multidimensional transformers. arXiv:1912.12180 (2019)"},{"key":"17_CR43","doi-asserted-by":"crossref","unstructured":"Hu, H., Gu, J., Zhang, Z., Dai, J., Wei, Y.: Relation networks for object detection. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00378"},{"key":"17_CR44","doi-asserted-by":"crossref","unstructured":"Hu, H., Zhang, Z., Xie, Z., Lin, S.: Local relation networks for image recognition. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00356"},{"key":"17_CR45","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"646","DOI":"10.1007\/978-3-319-46493-0_39","volume-title":"Computer Vision \u2013 ECCV 2016","author":"G Huang","year":"2016","unstructured":"Huang, G., Sun, Yu., Liu, Z., Sedra, D., Weinberger, K.Q.: Deep networks with stochastic depth. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 646\u2013661. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_39"},{"key":"17_CR46","doi-asserted-by":"crossref","unstructured":"Huang, Z., Wang, X., Huang, L., Huang, C., Wei, Y., Liu, W.: Ccnet: Criss-cross attention for semantic segmentation. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00069"},{"key":"17_CR47","doi-asserted-by":"crossref","unstructured":"Hwang, J.J., et al.: SegSort: Segmentation by discriminative sorting of segments. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00743"},{"key":"17_CR48","unstructured":"Jang, E., Gu, S., Poole, B.: Categorical reparameterization with gumbel-softmax. In: ICLR (2017)"},{"key":"17_CR49","unstructured":"Jia, X., De Brabandere, B., Tuytelaars, T., Gool, L.V.: Dynamic filter networks. In: NeurIPS (2016)"},{"key":"17_CR50","unstructured":"Kendall, A., Gal, Y., Cipolla, R.: Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. In: CVPR (2018)"},{"key":"17_CR51","doi-asserted-by":"crossref","unstructured":"Keuper, M., Levinkov, E., Bonneel, N., Lavou\u00e9, G., Brox, T., Andres, B.: Efficient decomposition of image and mesh graphs by lifted multicuts. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.204"},{"key":"17_CR52","doi-asserted-by":"crossref","unstructured":"Kim, D., et al.: TubeFormer-DeepLab: Video Mask Transformer. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01354"},{"key":"17_CR53","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. In: ICLR (2015)"},{"key":"17_CR54","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., Doll\u00e1r, P.: Panoptic feature pyramid networks. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00656"},{"key":"17_CR55","doi-asserted-by":"crossref","unstructured":"Kirillov, A., He, K., Girshick, R., Rother, C., Doll\u00e1r, P.: Panoptic segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00963"},{"key":"17_CR56","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Wu, Y., He, K., Girshick, R.: Pointrend: Image segmentation as rendering. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00982"},{"key":"17_CR57","unstructured":"Kitaev, N., Kaiser, \u0141., Levskaya, A.: Reformer: The efficient transformer. In: ICLR (2020)"},{"issue":"11","key":"17_CR58","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"key":"17_CR59","unstructured":"Leibe, B., Leonardis, A., Schiele, B.: Combined object categorization and segmentation with an implicit shape model. In: Workshop on statistical learning in computer vision, ECCV (2004)"},{"key":"17_CR60","doi-asserted-by":"crossref","unstructured":"Li, Q., Qi, X., Torr, P.H.: Unifying training and inference for panoptic segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01333"},{"key":"17_CR61","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Video k-net: A simple, strong, and unified baseline for video segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01828"},{"key":"17_CR62","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Attention-guided unified network for panoptic segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00719"},{"key":"17_CR63","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Fully convolutional networks for panoptic segmentation. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00028"},{"key":"17_CR64","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Neural architecture search for lightweight non-local networks. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01031"},{"key":"17_CR65","unstructured":"Li, Z., et al.: Panoptic segformer. In: CVPR (2022)"},{"key":"17_CR66","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"17_CR67","doi-asserted-by":"crossref","unstructured":"Liu, H., et al.: An end-to-end network for panoptic segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00633"},{"key":"17_CR68","doi-asserted-by":"crossref","unstructured":"Liu, S., Qi, L., Qin, H., Shi, J., Jia, J.: Path aggregation network for instance segmentation. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00913"},{"key":"17_CR69","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"708","DOI":"10.1007\/978-3-030-01219-9_42","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Liu","year":"2018","unstructured":"Liu, Y., Yang, S., Li, B., Zhou, W., Xu, J., Li, H., Lu, Y.: Affinity derivation and graph merge for instance segmentation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11207, pp. 708\u2013724. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01219-9_42"},{"key":"17_CR70","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: Hierarchical vision transformer using shifted windows. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"17_CR71","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"issue":"2","key":"17_CR72","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1109\/TIT.1982.1056489","volume":"28","author":"S Lloyd","year":"1982","unstructured":"Lloyd, S.: Least squares quantization in pcm. IEEE Trans. Inf. Theory 28(2), 129\u2013137 (1982)","journal-title":"IEEE Trans. Inf. Theory"},{"key":"17_CR73","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"17_CR74","doi-asserted-by":"crossref","unstructured":"Luong, M.T., Pham, H., Manning, C.D.: Effective approaches to attention-based neural machine translation. In: EMNLP (2015)","DOI":"10.18653\/v1\/D15-1166"},{"key":"17_CR75","doi-asserted-by":"crossref","unstructured":"Neuhold, G., Ollmann, T., Rota Bulo, S., Kontschieder, P.: The mapillary vistas dataset for semantic understanding of street scenes. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.534"},{"key":"17_CR76","doi-asserted-by":"crossref","unstructured":"Neven, D., Brabandere, B.D., Proesmans, M., Gool, L.V.: Instance segmentation by jointly optimizing spatial embeddings and clustering bandwidth. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00904"},{"key":"17_CR77","unstructured":"Parmar, N., et al.: Image transformer. In: ICML (2018)"},{"key":"17_CR78","doi-asserted-by":"crossref","unstructured":"Porzi, L., Bul\u00f2, S.R., Colovic, A., Kontschieder, P.: Seamless scene segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00847"},{"key":"17_CR79","doi-asserted-by":"crossref","unstructured":"Qiao, S., Chen, L.C., Yuille, A.: Detectors: Detecting objects with recursive feature pyramid and switchable atrous convolution. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01008"},{"key":"17_CR80","unstructured":"Ramachandran, P., Parmar, N., Vaswani, A., Bello, I., Levskaya, A., Shlens, J.: Stand-alone self-attention in vision models. In: NeurIPS (2019)"},{"key":"17_CR81","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: Imagenet large scale visual recognition challenge. IJCV 115, 211\u2013252 (2015)","journal-title":"IJCV"},{"key":"17_CR82","doi-asserted-by":"crossref","unstructured":"Shaw, P., Uszkoreit, J., Vaswani, A.: Self-attention with relative position representations. In: NAACL (2018)","DOI":"10.18653\/v1\/N18-2074"},{"key":"17_CR83","unstructured":"Shen, Z., Zhang, M., Zhao, H., Yi, S., Li, H.: Efficient attention: Attention with linear complexities. In: WACV (2021)"},{"key":"17_CR84","doi-asserted-by":"crossref","unstructured":"Sofiiuk, K., Barinova, O., Konushin, A.: Adaptis: Adaptive instance selection network. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00745"},{"key":"17_CR85","doi-asserted-by":"crossref","unstructured":"Strudel, R., Garcia, R., Laptev, I., Schmid, C.: Segmenter: Transformer for semantic segmentation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"17_CR86","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: NeurIPS (2014)"},{"key":"17_CR87","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"282","DOI":"10.1007\/978-3-030-58452-8_17","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Tian","year":"2020","unstructured":"Tian, Z., Shen, C., Chen, H.: Conditional convolutions for instance segmentation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 282\u2013298. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_17"},{"key":"17_CR88","doi-asserted-by":"crossref","unstructured":"Uhrig, J., Rehder, E., Fr\u00f6hlich, B., Franke, U., Brox, T.: Box2pix: Single-shot instance segmentation by assigning pixels to object boxes. In: IEEE Intelligent Vehicles Symposium (IV) (2018)","DOI":"10.1109\/IVS.2018.8500621"},{"key":"17_CR89","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"17_CR90","doi-asserted-by":"crossref","unstructured":"Vincent, L., Soille, P.: Watersheds in digital spaces: an efficient algorithm based on immersion simulations. In: IEEE TPAMI (1991)","DOI":"10.1109\/34.87344"},{"key":"17_CR91","doi-asserted-by":"crossref","unstructured":"Wang, H., Luo, R., Maire, M., Shakhnarovich, G.: Pixel consensus voting for panoptic segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00948"},{"key":"17_CR92","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhu, Y., Adam, H., Yuille, A., Chen, L.C.: Max-deeplab: End-to-end panoptic segmentation with mask transformers. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"17_CR93","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1007\/978-3-030-58548-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"H Wang","year":"2020","unstructured":"Wang, H., Zhu, Y., Green, B., Adam, H., Yuille, A., Chen, L.-C.: Axial-DeepLab: stand-alone axial-attention for panoptic segmentation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 108\u2013126. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_7"},{"key":"17_CR94","unstructured":"Wang, S., Li, B., Khabsa, M., Fang, H., Ma, H.: Linformer: Self-attention with linear complexity. arXiv:2006.04768 (2020)"},{"key":"17_CR95","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Pvtv 2: Improved baselines with pyramid vision transformer. arXiv:2106.13797 (2021)","DOI":"10.1007\/s41095-022-0274-8"},{"key":"17_CR96","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"17_CR97","unstructured":"Wang, X., Zhang, R., Kong, T., Li, L., Shen, C.: SOLOv2: Dynamic and fast instance segmentation. In: NeurIPS (2020)"},{"key":"17_CR98","unstructured":"Weber, M., et al.: DeepLab2: A TensorFlow Library for Deep Labeling. arXiv: 2106.09748 (2021)"},{"key":"17_CR99","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: Simple and efficient design for semantic segmentation with transformers. In: NeurIPS (2021)"},{"key":"17_CR100","doi-asserted-by":"crossref","unstructured":"Xiong, Y., et al.: Upsnet: A unified panoptic segmentation network. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00902"},{"key":"17_CR101","doi-asserted-by":"crossref","unstructured":"Yang, C., et al.: Lite vision transformer with enhanced self-attention. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01169"},{"key":"17_CR102","unstructured":"Yang, T.J., et al.: Deeperlab: Single-shot image parser. arXiv:1902.05093 (2019)"},{"key":"17_CR103","doi-asserted-by":"crossref","unstructured":"Yang, Y., Li, H., Li, X., Zhao, Q., Wu, J., Lin, Z.: Sognet: Scene overlap graph network for panoptic segmentation. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6955"},{"key":"17_CR104","doi-asserted-by":"crossref","unstructured":"Yu, Q., et al.: Cmt-deeplab: Clustering mask transformers for panoptic segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00259"},{"key":"17_CR105","unstructured":"Yu, Q., Xia, Y., Bai, Y., Lu, Y., Yuille, A.L., Shen, W.: Glance-and-gaze vision transformer. In: NeurIPS (2021)"},{"key":"17_CR106","unstructured":"Zaheer, M., et al.: Big bird: Transformers for longer sequences. In: NeurIPS (2020)"},{"key":"17_CR107","unstructured":"Zhang, W., Pang, J., Chen, K., Loy, C.C.: K-net: Towards unified image segmentation. In: NeurIPS (2021)"},{"key":"17_CR108","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1007\/978-3-030-01240-3_17","volume-title":"Computer Vision \u2013 ECCV 2018","author":"H Zhao","year":"2018","unstructured":"Zhao, H., et al.: PSANet: Point-wise spatial attention network for scene parsing. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11213, pp. 270\u2013286. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01240-3_17"},{"key":"17_CR109","doi-asserted-by":"crossref","unstructured":"Zheng, S., et al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"17_CR110","unstructured":"Zhu, S.C., Yuille, A.: Region competition: Unifying snakes, region growing, and bayes\/mdl for multiband image segmentation. In: IEEE TPAMI (1996)"},{"key":"17_CR111","doi-asserted-by":"crossref","unstructured":"Zhu, X., Cheng, D., Zhang, Z., Lin, S., Dai, J.: An empirical study of spatial attention mechanisms in deep networks. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00679"},{"key":"17_CR112","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: Deformable transformers for end-to-end object detection. In: ICLR (2021)"},{"key":"17_CR113","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Xu, M., Bai, S., Huang, T., Bai, X.: Asymmetric non-local neural networks for semantic segmentation. In: CVPR (2019)","DOI":"10.1109\/ICCV.2019.00068"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19818-2_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T14:25:38Z","timestamp":1710339938000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19818-2_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198175","9783031198182"],"references-count":113,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19818-2_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"22 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}