{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T16:24:59Z","timestamp":1778084699421,"version":"3.51.4"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729485","type":"print"},{"value":"9783031729492","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72949-2_3","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:22:17Z","timestamp":1730301737000},"page":"39-56","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["MTA-CLIP: Language-Guided Semantic Segmentation with\u00a0Mask-Text Alignment"],"prefix":"10.1007","author":[{"given":"Anurag","family":"Das","sequence":"first","affiliation":[]},{"given":"Xinting","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Li","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Bernt","family":"Schiele","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Bertasius, G., Shi, J., Torresani, L.: Semantic segmentation with boundary neural fields. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3602\u20133610 (2016)","DOI":"10.1109\/CVPR.2016.392"},{"issue":"4","key":"3_CR2","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2017","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017). https:\/\/doi.org\/10.1109\/TPAMI.2017.2699184","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"4","key":"3_CR3","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2018","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2018). https:\/\/doi.org\/10.1109\/TPAMI.2017.2699184","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 801\u2013818 (2018)","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"3_CR6","first-page":"17864","volume":"34","author":"B Cheng","year":"2021","unstructured":"Cheng, B., Schwing, A., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. Adv. Neural. Inf. Process. Syst. 34, 17864\u201317875 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR7","doi-asserted-by":"crossref","unstructured":"Cho, S., Shin, H., Hong, S., Arnab, A., Seo, P.H., Kim, S.: Cat-seg: cost aggregation for open-vocabulary semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4113\u20134123 (2024)","DOI":"10.1109\/CVPR52733.2024.00394"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3213\u20133223 (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Ding, H., Jiang, X., Liu, A.Q., Thalmann, N.M., Wang, G.: Boundary-aware feature propagation for scene segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6819\u20136829 (2019)","DOI":"10.1109\/ICCV.2019.00692"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Ding, J., Xue, N., Xia, G., Dai, D.: Decoupling zero-shot semantic segmentation. In: CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11573\u201311582. IEEE (2021)","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"3_CR12","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"Gao, P., et al.: Clip-adapter: better vision-language models with feature adapters. Int. J. Comput. Vision 1\u201315 (2023)","DOI":"10.1007\/s11263-023-01891-x"},{"key":"3_CR14","doi-asserted-by":"publisher","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Scaling open-vocabulary image segmentation with image-level labels. In: European Conference on Computer Vision, pp. 540\u2013557. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_31","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3_CR16","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for nlp. In: International Conference on Machine Learning, pp. 2790\u20132799. PMLR (2019)"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: Maple: multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., Doll\u00e1r, P.: Panoptic feature pyramid networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6399\u20136408 (2019)","DOI":"10.1109\/CVPR.2019.00656"},{"key":"3_CR19","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Adv. Neural Inf. Process. Syst. 25 (2012)"},{"key":"3_CR20","doi-asserted-by":"crossref","unstructured":"Kwon, H., Song, T., Jeong, S., Kim, J., Jang, J., Sohn, K.: Probabilistic prompt learning for dense prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6768\u20136777 (2023)","DOI":"10.1109\/CVPR52729.2023.00654"},{"key":"3_CR21","doi-asserted-by":"publisher","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. In: Moens, M.F., Huang, X., Specia, L., Yih, S.W.t. (eds.) Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 3045\u20133059. Association for Computational Linguistics, Online and Punta Cana (2021). https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.243. https:\/\/aclanthology.org\/2021.emnlp-main.243","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"3_CR22","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: International Conference on Learning Representations (2022)"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Li, F., et al.: Mask dino: towards a unified transformer-based framework for object detection and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3041\u20133050 (2023)","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, W., Hu, X., Yang, J.: Selective kernel networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 510\u2013519 (2019)","DOI":"10.1109\/CVPR.2019.00060"},{"key":"3_CR25","doi-asserted-by":"publisher","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: Optimizing continuous prompts for generation. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, vol. 1: Long Papers, pp. 4582\u20134597. Association for Computational Linguistics, Online (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.353. https:\/\/aclanthology.org\/2021.acl-long.353","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"3_CR27","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2019). https:\/\/openreview.net\/forum?id=Bkg6RiCqY7"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Lu, Y., Liu, J., Zhang, Y., Liu, Y., Tian, X.: Prompt distribution learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5206\u20135215 (2022)","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"3_CR29","doi-asserted-by":"crossref","unstructured":"Peng, C., Zhang, X., Yu, G., Luo, G., Sun, J.: Large kernel matters\u2013improve semantic segmentation by global convolutional network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4353\u20134361 (2017)","DOI":"10.1109\/CVPR.2017.189"},{"key":"3_CR30","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"3_CR31","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: Denseclip: language-guided dense prediction with context-aware prompting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18082\u201318091 (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"3_CR32","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: Bengio, Y., LeCun, Y. (eds.) 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, 7\u20139 May 2015, Conference Track Proceedings (2015)"},{"key":"3_CR33","doi-asserted-by":"crossref","unstructured":"Strudel, R., Garcia, R., Laptev, I., Schmid, C.: Segmenter: transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7262\u20137272 (2021)","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Takikawa, T., Acuna, D., Jampani, V., Fidler, S.: Gated-scnn: gated shape cnns for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5229\u20135238 (2019)","DOI":"10.1109\/ICCV.2019.00533"},{"key":"3_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1007\/978-3-030-58548-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"H Wang","year":"2020","unstructured":"Wang, H., Zhu, Y., Green, B., Adam, H., Yuille, A., Chen, L.-C.: Axial-DeepLab: stand-alone axial-attention for panoptic segmentation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 108\u2013126. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_7"},{"key":"3_CR36","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Deep high-resolution representation learning for visual recognition. TPAMI 43, 3349\u20133364 (2019)","DOI":"10.1109\/TPAMI.2020.2983686"},{"key":"3_CR37","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: beit pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19175\u201319186 (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., Sun, J.: Unified perceptual parsing for scene understanding. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 418\u2013434 (2018)","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"3_CR39","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: simple and efficient design for semantic segmentation with transformers. Adv. Neural. Inf. Process. Syst. 34, 12077\u201312090 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR40","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Hu, H., Bai, X.: Side adapter network for open-vocabulary semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2945\u20132954 (2023)","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"3_CR41","doi-asserted-by":"publisher","unstructured":"Xu, M., et al.: A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model. In: European Conference on Computer Vision, pp. 736\u2013753. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19818-2_42","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"3_CR42","doi-asserted-by":"crossref","unstructured":"Yu, C., Wang, J., Gao, C., Yu, G., Shen, C., Sang, N.: Context prior for scene segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12416\u201312425 (2020)","DOI":"10.1109\/CVPR42600.2020.01243"},{"key":"3_CR43","doi-asserted-by":"crossref","unstructured":"Yu, C., et al.: Lite-hrnet: a lightweight high-resolution network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10440\u201310450 (2021)","DOI":"10.1109\/CVPR46437.2021.01030"},{"key":"3_CR44","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1007\/978-3-030-58610-2_29","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Yuan","year":"2020","unstructured":"Yuan, Y., Xie, J., Chen, X., Wang, J.: SegFix: model-agnostic boundary refinement for segmentation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12357, pp. 489\u2013506. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58610-2_29"},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., Jia, J.: Pyramid scene parsing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2881\u20132890 (2017)","DOI":"10.1109\/CVPR.2017.660"},{"key":"3_CR46","doi-asserted-by":"crossref","unstructured":"Zheng, S., et\u00a0al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6881\u20136890 (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"3_CR47","unstructured":"Zheng\u00a0Ding, Jieke\u00a0Wang, Z.T.: Open-vocabulary universal image segmentation with maskclip. In: International Conference on Machine Learning (2023)"},{"key":"3_CR48","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ade20k dataset. Int. J. Comput. Vision 127, 302\u2013321 (2019)","journal-title":"Int. J. Comput. Vision"},{"key":"3_CR49","doi-asserted-by":"publisher","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from clip. In: European Conference on Computer Vision, pp. 696\u2013712. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_40","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"3_CR50","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"3_CR51","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72949-2_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:39:15Z","timestamp":1730302755000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72949-2_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031729485","9783031729492"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72949-2_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}