{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T15:56:26Z","timestamp":1780502186947,"version":"3.54.1"},"publisher-location":"Cham","reference-count":69,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031728891","type":"print"},{"value":"9783031728907","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72890-7_15","type":"book-chapter","created":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T19:45:52Z","timestamp":1733514352000},"page":"248-266","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":15,"title":["DIAL: Dense Image-Text ALignment for\u00a0Weakly Supervised Semantic Segmentation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2719-7646","authenticated-orcid":false,"given":"Soojin","family":"Jang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6868-286X","authenticated-orcid":false,"given":"Jungmin","family":"Yun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7887-5884","authenticated-orcid":false,"given":"Junehyoung","family":"Kwon","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6571-0156","authenticated-orcid":false,"given":"Eunju","family":"Lee","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2114-0120","authenticated-orcid":false,"given":"Youngbin","family":"Kim","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,12,7]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Ahn, J., Cho, S., Kwak, S.: Weakly supervised learning of instance segmentation with inter-pixel relations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2209\u20132218 (2019)","DOI":"10.1109\/CVPR.2019.00231"},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"Ahn, J., Kwak, S.: Learning pixel-level semantic affinity with image-level supervision for weakly supervised semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4981\u20134990 (2018)","DOI":"10.1109\/CVPR.2018.00523"},{"key":"15_CR3","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Araslanov, N., Roth, S.: Single-stage semantic segmentation from image labels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4253\u20134262 (2020)","DOI":"10.1109\/CVPR42600.2020.00431"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"issue":"4","key":"15_CR6","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2017","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Chen, L., Lei, C., Li, R., Li, S., Zhang, Z., Zhang, L.: Fpr: false positive rectification for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1108\u20131118 (2023)","DOI":"10.1109\/ICCV51070.2023.00108"},{"key":"15_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Q., Yang, L., Lai, J.H., Xie, X.: Self-supervised image-specific prototype exploration for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4288\u20134298 (2022)","DOI":"10.1109\/CVPR52688.2022.00425"},{"key":"15_CR9","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: Simclr: a simple framework for contrastive learning of visual representations. In: International Conference on Learning Representations, vol.\u00a02 (2020)"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Y., et al.: Revisiting multimodal representation in contrastive learning: from patch and token embeddings to finite discrete tokens. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15095\u201315104 (2023)","DOI":"10.1109\/CVPR52729.2023.01449"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Chen, Z., Sun, Q.: Extracting class activation maps from non-discriminative features as well. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3135\u20133144 (2023)","DOI":"10.1109\/CVPR52729.2023.00306"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wang, T., Wu, X., Hua, X.S., Zhang, H., Sun, Q.: Class re-activation maps for weakly-supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 969\u2013978 (2022)","DOI":"10.1109\/CVPR52688.2022.00104"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Desai, K., Johnson, J.: Virtex: learning visual representations from textual annotations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11162\u201311173 (2021)","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Ding, X., Zhang, X., Han, J., Ding, G.: Scaling up your kernels to 31x31: revisiting large kernel design in cnns. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11963\u201311975 (2022)","DOI":"10.1109\/CVPR52688.2022.01166"},{"key":"15_CR15","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Du, Y., Fu, Z., Liu, Q., Wang, Y.: Weakly supervised semantic segmentation by pixel-to-prototype contrast. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4320\u20134329 (2022)","DOI":"10.1109\/CVPR52688.2022.00428"},{"key":"15_CR17","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The pascal visual object classes (voc) challenge. Int. J. Comput. Vision 88, 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vision"},{"key":"15_CR18","first-page":"21271","volume":"33","author":"JB Grill","year":"2020","unstructured":"Grill, J.B., et al.: Bootstrap your own latent-a new approach to self-supervised learning. Adv. Neural. Inf. Process. Syst. 33, 21271\u201321284 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR19","doi-asserted-by":"crossref","unstructured":"Hariharan, B., Arbel\u00e1ez, P., Bourdev, L., Maji, S., Malik, J.: Semantic contours from inverse detectors. In: 2011 international conference on computer vision. pp. 991\u2013998. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126343"},{"key":"15_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"15_CR21","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2022.105706","volume":"119","author":"S Jang","year":"2023","unstructured":"Jang, S., Kwon, J., Jin, K., Kim, Y.: Weakly supervised semantic segmentation via graph recalibration with scaling weight unit. Eng. Appl. Artif. Intell. 119, 105706 (2023)","journal-title":"Eng. Appl. Artif. Intell."},{"key":"15_CR22","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Jiang, P.T., Yang, Y., Hou, Q., Wei, Y.: L2g: a simple local-to-global knowledge transfer framework for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16886\u201316896 (2022)","DOI":"10.1109\/CVPR52688.2022.01638"},{"key":"15_CR24","first-page":"18590","volume":"34","author":"ZH Jiang","year":"2021","unstructured":"Jiang, Z.H., et al.: All tokens matter: token labeling for training better vision transformers. Adv. Neural. Inf. Process. Syst. 34, 18590\u201318602 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Kim, B., Han, S., Kim, J.: Discriminative region suppression for weakly-supervised semantic segmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 1754\u20131761 (2021)","DOI":"10.1609\/aaai.v35i2.16269"},{"key":"15_CR26","doi-asserted-by":"publisher","first-page":"149","DOI":"10.1016\/j.patrec.2023.02.018","volume":"167","author":"BJ Kim","year":"2023","unstructured":"Kim, B.J., Choi, H., Jang, H., Lee, D.G., Jeong, W., Kim, S.W.: Dead pixel test using effective receptive field. Pattern Recogn. Lett. 167, 149\u2013156 (2023)","journal-title":"Pattern Recogn. Lett."},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Kweon, H., Yoon, S.H., Yoon, K.J.: Weakly supervised semantic segmentation via adversarial learning of classifier and reconstructor. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11329\u201311339 (2023)","DOI":"10.1109\/CVPR52729.2023.01090"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Kwon, J., Lee, E., Cho, Y., Kim, Y.: Learning to detour: shortcut mitigating augmentation for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 819\u2013828 (2024)","DOI":"10.1109\/WACV57701.2024.00087"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Lee, J., Kim, E., Yoon, S.: Anti-adversarially manipulated attributions for weakly and semi-supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4071\u20134080 (2021)","DOI":"10.1109\/CVPR46437.2021.00406"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Lee, J., Oh, S.J., Yun, S., Choe, J., Kim, E., Yoon, S.: Weakly supervised semantic segmentation using out-of-distribution data. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16897\u201316906 (2022)","DOI":"10.1109\/CVPR52688.2022.01639"},{"key":"15_CR31","doi-asserted-by":"crossref","unstructured":"Lee, M., Kim, D., Shim, H.: Threshold matters in wsss: manipulating the activation for the robust and accurate segmentation model against thresholds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4330\u20134339 (2022)","DOI":"10.1109\/CVPR52688.2022.00429"},{"key":"15_CR32","doi-asserted-by":"crossref","unstructured":"Lee, S., Lee, M., Lee, J., Shim, H.: Railroad is not a train: saliency as pseudo-pixel supervision for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5495\u20135505 (2021)","DOI":"10.1109\/CVPR46437.2021.00545"},{"key":"15_CR33","unstructured":"Li, B., Weinberger, K.Q., Belongie, S., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. arXiv preprint arXiv:2201.03546 (2022)"},{"key":"15_CR34","first-page":"16037","volume":"35","author":"J Li","year":"2022","unstructured":"Li, J., Jie, Z., Wang, X., Wei, X., Ma, L.: Expansion and shrinkage of localization for weakly-supervised semantic segmentation. Adv. Neural. Inf. Process. Syst. 35, 16037\u201316051 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR35","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: Visualbert: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"15_CR36","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7061\u20137070 (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"15_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"15_CR38","doi-asserted-by":"crossref","unstructured":"Lin, Y., et al.: Clip is also an efficient segmenter: a text-driven approach for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15305\u201315314 (2023)","DOI":"10.1109\/CVPR52729.2023.01469"},{"key":"15_CR39","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"15_CR40","doi-asserted-by":"crossref","unstructured":"Mukhoti, J., et al.: Open vocabulary semantic segmentation with patch aligned contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19413\u201319423 (2023)","DOI":"10.1109\/CVPR52729.2023.01860"},{"key":"15_CR41","unstructured":"Oord, A.V.D., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"issue":"5","key":"15_CR42","doi-asserted-by":"publisher","first-page":"1181","DOI":"10.1007\/s11263-022-01590-z","volume":"130","author":"J Pan","year":"2022","unstructured":"Pan, J., et al.: Learning self-supervised low-rank network for single-stage weakly and semi-supervised semantic segmentation. Int. J. Comput. Vision 130(5), 1181\u20131195 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"15_CR43","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"15_CR44","first-page":"12116","volume":"34","author":"M Raghu","year":"2021","unstructured":"Raghu, M., Unterthiner, T., Kornblith, S., Zhang, C., Dosovitskiy, A.: Do vision transformers see like convolutional neural networks? Adv. Neural. Inf. Process. Syst. 34, 12116\u201312128 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR45","unstructured":"Ridnik, T., Ben-Baruch, E., Noy, A., Zelnik-Manor, L.: Imagenet-21k pretraining for the masses. arXiv preprint arXiv:2104.10972 (2021)"},{"key":"15_CR46","doi-asserted-by":"publisher","unstructured":"Rossetti, S., Zappia, D., Sanzari, M., Schaerf, M., Pirri, F.: Max pooling with vision transformers reconciles class and shape in weakly supervised semantic segmentation. In: European Conference on Computer Vision, pp. 446\u2013463. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-20056-4_26","DOI":"10.1007\/978-3-031-20056-4_26"},{"issue":"4","key":"15_CR47","doi-asserted-by":"publisher","first-page":"1127","DOI":"10.1007\/s11263-022-01586-9","volume":"130","author":"L Ru","year":"2022","unstructured":"Ru, L., Du, B., Zhan, Y., Wu, C.: Weakly-supervised semantic segmentation with visual words learning and hybrid pooling. Int. J. Comput. Vision 130(4), 1127\u20131144 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"15_CR48","doi-asserted-by":"crossref","unstructured":"Ru, L., Zhan, Y., Yu, B., Du, B.: Learning affinity from attention: end-to-end weakly-supervised semantic segmentation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16846\u201316855 (2022)","DOI":"10.1109\/CVPR52688.2022.01634"},{"key":"15_CR49","doi-asserted-by":"crossref","unstructured":"Ru, L., Zheng, H., Zhan, Y., Du, B.: Token contrast for weakly-supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3093\u20133102 (2023)","DOI":"10.1109\/CVPR52729.2023.00302"},{"key":"15_CR50","doi-asserted-by":"crossref","unstructured":"Tang, M., Perazzi, F., Djelouah, A., Ben\u00a0Ayed, I., Schroers, C., Boykov, Y.: On regularized losses for weakly-supervised cnn segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 507\u2013522 (2018)","DOI":"10.1007\/978-3-030-01270-0_31"},{"key":"15_CR51","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357. PMLR (2021)"},{"key":"15_CR52","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"15_CR53","unstructured":"Veit, A., Wilber, M.J., Belongie, S.: Residual networks behave like ensembles of relatively shallow networks. Adv. Neural Inf. Process. Syst. 29 (2016)"},{"key":"15_CR54","doi-asserted-by":"crossref","unstructured":"Wang, Y., Zhang, J., Kan, M., Shan, S., Chen, X.: Self-supervised equivariant attention mechanism for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12275\u201312284 (2020)","DOI":"10.1109\/CVPR42600.2020.01229"},{"key":"15_CR55","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Cris: clip-driven referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11686\u201311695 (2022)","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"15_CR56","doi-asserted-by":"crossref","unstructured":"Wei, Y., Xiao, H., Shi, H., Jie, Z., Feng, J., Huang, T.S.: Revisiting dilated convolution: A simple approach for weakly-and semi-supervised semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 7268\u20137277 (2018)","DOI":"10.1109\/CVPR.2018.00759"},{"key":"15_CR57","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1016\/j.patcog.2019.01.006","volume":"90","author":"Z Wu","year":"2019","unstructured":"Wu, Z., Shen, C., Van Den Hengel, A.: Wider or deeper: revisiting the resnet model for visual recognition. Pattern Recogn. 90, 119\u2013133 (2019)","journal-title":"Pattern Recogn."},{"key":"15_CR58","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: simple and efficient design for semantic segmentation with transformers. Adv. Neural. Inf. Process. Syst. 34, 12077\u201312090 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR59","doi-asserted-by":"crossref","unstructured":"Xie, J., Hou, X., Ye, K., Shen, L.: Clims: cross language image matching for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4483\u20134492 (2022)","DOI":"10.1109\/CVPR52688.2022.00444"},{"key":"15_CR60","doi-asserted-by":"crossref","unstructured":"Xie, J., Xiang, J., Chen, J., Hou, X., Zhao, X., Shen, L.: C2am: contrastive learning of class-agnostic activation map for weakly supervised object localization and semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 989\u2013998 (2022)","DOI":"10.1109\/CVPR52688.2022.00106"},{"key":"15_CR61","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: Groupvit: semantic segmentation emerges from text supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18134\u201318144 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"15_CR62","doi-asserted-by":"crossref","unstructured":"Xu, L., Ouyang, W., Bennamoun, M., Boussaid, F., Xu, D.: Multi-class token transformer for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4310\u20134319 (2022)","DOI":"10.1109\/CVPR52688.2022.00427"},{"key":"15_CR63","doi-asserted-by":"crossref","unstructured":"Xu, L., Ouyang, W., Bennamoun, M., Boussaid, F., Xu, D.: Learning multi-modal class-specific tokens for weakly supervised dense object localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19596\u201319605 (2023)","DOI":"10.1109\/CVPR52729.2023.01877"},{"key":"15_CR64","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Vision-language pre-training with triple contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15671\u201315680 (2022)","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"15_CR65","doi-asserted-by":"crossref","unstructured":"Yi, M., Cui, Q., Wu, H., Yang, C., Yoshie, O., Lu, H.: A simple framework for text-supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7071\u20137080 (2023)","DOI":"10.1109\/CVPR52729.2023.00683"},{"key":"15_CR66","doi-asserted-by":"crossref","unstructured":"Yun, S., Park, S.H., Seo, P.H., Shin, J.: Ifseg: image-free semantic segmentation via vision-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2967\u20132977 (2023)","DOI":"10.1109\/CVPR52729.2023.00290"},{"key":"15_CR67","doi-asserted-by":"crossref","unstructured":"Zhang, B., Xiao, J., Wei, Y., Sun, M., Huang, K.: Reliability does matter: an end-to-end weakly supervised semantic segmentation approach. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 12765\u201312772 (2020)","DOI":"10.1609\/aaai.v34i07.6971"},{"key":"15_CR68","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"15_CR69","doi-asserted-by":"crossref","unstructured":"Zhou, T., Zhang, M., Zhao, F., Li, J.: Regional semantic contrast and aggregation for weakly supervised semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4299\u20134309 (2022)","DOI":"10.1109\/CVPR52688.2022.00426"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72890-7_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T20:06:07Z","timestamp":1733515567000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72890-7_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,7]]},"ISBN":["9783031728891","9783031728907"],"references-count":69,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72890-7_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,7]]},"assertion":[{"value":"7 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}