{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T16:07:51Z","timestamp":1775146071269,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":118,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819609710","type":"print"},{"value":"9789819609727","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,10]],"date-time":"2024-12-10T00:00:00Z","timestamp":1733788800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,10]],"date-time":"2024-12-10T00:00:00Z","timestamp":1733788800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0972-7_27","type":"book-chapter","created":{"date-parts":[[2024,12,9]],"date-time":"2024-12-09T08:06:56Z","timestamp":1733731616000},"page":"463-484","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Strong but\u00a0Simple: A Baseline for\u00a0Domain Generalized Dense Perception by\u00a0CLIP-Based Transfer Learning"],"prefix":"10.1007","author":[{"given":"Christoph","family":"H\u00fcmmer","sequence":"first","affiliation":[]},{"given":"Manuel","family":"Schwonberg","sequence":"additional","affiliation":[]},{"given":"Liangwei","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Hu","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Alois","family":"Knoll","sequence":"additional","affiliation":[]},{"given":"Hanno","family":"Gottschalk","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,10]]},"reference":[{"key":"27_CR1","unstructured":"Alayrac, J.B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., Ring, R., Rutherford, E., Cabi, S., Han, T., Gong, Z., Samangooei, S., Monteiro, M., Menick, J.L., Borgeaud, S., Brock, A., Nematzadeh, A., Sharifzadeh, S., Bi\u0144kowski, M.a., Barreira, R., Vinyals, O., Zisserman, A., Simonyan, K.: Flamingo: a visual language model for few-shot learning. In: Proc. of NeurIPS (2022)"},{"key":"27_CR2","unstructured":"Bao, H., Wang, W., Dong, L., Wei, F.: Vl-beit: Generative vision-language pretraining (2022)"},{"key":"27_CR3","doi-asserted-by":"crossref","unstructured":"Benigmim, Y., Roy, S., Essid, S., Kalogeiton, V., Lathuili\u00e8re, S.: Collaborating foundation models for domain generalized semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3108\u20133119 (2024)","DOI":"10.1109\/CVPR52733.2024.00300"},{"key":"27_CR4","unstructured":"Bi, Q., You, S., Gevers, T.: Learning Content-enhanced Mask Transformer for Domain Generalized Urban-Scene Segmentation. arXiv:2307.00371 pp. 1\u201318 (2023)"},{"key":"27_CR5","unstructured":"Bi, Q., Zhou, B., Yi, J., Ji, W., Zhan, H., Xia, G.S.: Good: Towards domain generalized orientated object detection. arXiv preprint arXiv:2402.12765 (2024)"},{"key":"27_CR6","unstructured":"Borse, S., Cai, H., Zhang, Y., Porikli, F.: Hs3: Learning with proper task complexity in hierarchically supervised semantic segmentation. arXiv preprint arXiv:2111.02333 (2021)"},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Borse, S., Wang, Y., Zhang, Y., Porikli, F.: Inverseform: A loss function for structured boundary-aware segmentation. In: Proc. of CVPR. pp. 5901\u20135911 (2021)","DOI":"10.1109\/CVPR46437.2021.00584"},{"key":"27_CR8","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: Proc. of ICCV. pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Chattopadhyay*, P., Sarangmath*, K., Vijaykumar, V., Hoffman, J.: Pasta: Proportional amplitude spectrum training augmentation for syn-to-real domain generalization. In: Proc. of ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01767"},{"key":"27_CR10","doi-asserted-by":"crossref","unstructured":"Chen, C., Zheng, Z., Ding, X., Huang, Y., Dou, Q.: Harmonizing transferability and discriminability for adapting object detectors. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 8869\u20138878 (2020)","DOI":"10.1109\/CVPR42600.2020.00889"},{"key":"27_CR11","unstructured":"Chen, K., Wang, J., Pang, J., Cao, Y., Xiong, Y., Li, X., Sun, S., Feng, W., Liu, Z., Xu, J., et\u00a0al.: Mmdetection: Open mmlab detection toolbox and benchmark. arXiv preprint arXiv:1906.07155 (2019)"},{"key":"27_CR12","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: Proc. of ICML. pp. 1597\u20131607. PMLR (2020)"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"Chen, X., Xie, S., He, K.: An empirical study of training self-supervised vision transformers. in 2021 ieee. In: Proc. of ICCV. pp. 9620\u20139629","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"27_CR14","unstructured":"Chen, Z., Duan, Y., Wang, W., He, J., Lu, T., Dai, J., Qiao, Y.: Vision transformer adapter for dense predictions. arXiv preprint arXiv:2205.08534 (2022)"},{"key":"27_CR15","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proc. of CVPR. pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"27_CR16","doi-asserted-by":"crossref","unstructured":"Choi, S., Jung, S., Yun, H., Kim, J.T., Kim, S., Choo, J.: RobustNet: Improving Domain Generalization in Urban-Scene Segmentation via Instance Selective Whitening. In: Proc. of CVPR. pp. 11580\u201311590 (Jun 2021)","DOI":"10.1109\/CVPR46437.2021.01141"},{"key":"27_CR17","unstructured":"Contributors, M.: Mmsegmentation: Openmmlab semantic segmentation toolbox and benchmark (2020)"},{"key":"27_CR18","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., Franke, U., Roth, S., Schiele, B.: The cityscapes dataset for semantic urban scene understanding (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"27_CR19","doi-asserted-by":"crossref","unstructured":"Cubuk, E.D., Zoph, B., Shlens, J., Le, Q.: Randaugment: Practical automated data augmentation with a reduced search space. In: Proc. of NeurIPS. vol.\u00a033 (2020)","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"27_CR20","doi-asserted-by":"crossref","unstructured":"Ding, J., Xue, N., Xia, G.S., Schiele, B., Dai, D.: Hgformer: Hierarchical grouping transformer for domain generalized semantic segmentation. In: Proc. of CVPR. pp. 15413\u201315423 (2023)","DOI":"10.1109\/CVPR52729.2023.01479"},{"key":"27_CR21","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In: Proc. of ICLR. pp. 1\u201321 (May 2021)"},{"key":"27_CR22","doi-asserted-by":"crossref","unstructured":"Fahes, M., Vu, T.H., Bursuc, A., P\u00e9rez, P., de\u00a0Charette, R.: A simple recipe for language-guided domain generalized segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 23428\u201323437 (2024)","DOI":"10.1109\/CVPR52733.2024.02211"},{"key":"27_CR23","doi-asserted-by":"crossref","unstructured":"Fahes, M., Vu, T.H., Bursuc, A., P\u00e9rez, P., de\u00a0Charette, R.: P\u00f8da: Prompt-driven zero-shot domain adaptation (2023)","DOI":"10.1109\/ICCV51070.2023.01707"},{"key":"27_CR24","doi-asserted-by":"crossref","unstructured":"Fang, Y., Sun, Q., Wang, X., Huang, T., Wang, X., Cao, Y.: Eva-02: A visual representation for neon genesis. arXiv preprint arXiv:2303.11331 (2023)","DOI":"10.2139\/ssrn.4813567"},{"key":"27_CR25","unstructured":"Feng, Y., Jiang, J., Tang, M., Jin, R., Gao, Y.: Rethinking supervised pre-training for better downstream transferring. In: Proc. of ICLR (2021)"},{"key":"27_CR26","unstructured":"Geng, X., Liu, H., Lee, L., Schuurmans, D., Levine, S., Abbeel, P.: Multimodal masked autoencoders learn transferable representations (2022)"},{"key":"27_CR27","unstructured":"Goldblum, M., Souri, H., Ni, R., Shu, M., Prabhu, V.U., Somepalli, G., Chattopadhyay, P., Ibrahim, M., Bardes, A., Hoffman, J., Chellappa, R., Wilson, A.G., Goldstein, T.: Battle of the backbones: A large-scale comparison of pretrained models across computer vision tasks. In: Proc. of NeurIPS Datasets and Benchmarks Track (2023), https:\/\/openreview.net\/forum?id=1yOnfDpkVe"},{"key":"27_CR28","unstructured":"G\u00f3mez, J.L., Silva, M., Seoane, A., Borr\u00e1s, A., Noriega, M., Ros, G., Iglesias-Guitian, J.A., L\u00f3pez, A.M.: All for one, and one for all: Urbansyn dataset, the third musketeer of synthetic driving scenes. arXiv preprint arXiv:2312.12176 (2023)"},{"key":"27_CR29","unstructured":"Gong, R., Danelljan, M., Sun, H., Mangas, J.D., Van\u00a0Gool, L.: Prompting Diffusion Representations for Cross-Domain Semantic Segmentation. arXiv:2307.02138 pp. 1\u201317 (2023)"},{"key":"27_CR30","first-page":"21271","volume":"33","author":"JB Grill","year":"2020","unstructured":"Grill, J.B., Strub, F., Altch\u00e9, F., Tallec, C., Richemond, P., Buchatskaya, E., Doersch, C., Avila Pires, B., Guo, Z., Gheshlaghi Azar, M., et al.: Bootstrap your own latent-a new approach to self-supervised learning. In Proc. of NeurIPS 33, 21271\u201321284 (2020)","journal-title":"In Proc. of NeurIPS"},{"key":"27_CR31","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proc. of CVPR. pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"27_CR32","doi-asserted-by":"crossref","unstructured":"He, K., Girshick, R., Doll\u00e1r, P.: Rethinking imagenet pre-training. In: Proc. of ICCV. pp. 4918\u20134927 (2019)","DOI":"10.1109\/ICCV.2019.00502"},{"key":"27_CR33","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proc. of CVPR. pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"27_CR34","doi-asserted-by":"crossref","unstructured":"He, W., Jamonnak, S., Gou, L., Ren, L.: Clip-s4: Language-guided self-supervised semantic segmentation. In: Proc. of CVPR. pp. 11207\u201311216 (June 2023)","DOI":"10.1109\/CVPR52729.2023.01078"},{"key":"27_CR35","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zou, A., Mazeika, M., Tang, L., Li, B., Song, D., Steinhardt, J.: PixMix: Dreamlike Pictures Comprehensively Improve Safety Measures. In: Proc. of CVPR. pp. 16783\u201316792. New Orleans, LA, USA (Jun 2022)","DOI":"10.1109\/CVPR52688.2022.01628"},{"key":"27_CR36","unstructured":"Hoffman, J., Tzeng, E., Park, T., Zhu, J.Y., Isola, P., Saenko, K., Efros, A.A., Darrell, T.: CyCADA: Cycle-Consistent Adversarial Domain Adaptation. In: Proc. of ICML. pp. 1989\u20131998 (Jul 2018)"},{"key":"27_CR37","doi-asserted-by":"crossref","unstructured":"Hoyer, L., Dai, D., Van\u00a0Gool, L.: DAFormer: Improving Network Architectures and Training Strategies for Domain-Adaptive Semantic Segmentation. In: Proc. of CVPR. pp. 9924\u20139935 (Jun 2022)","DOI":"10.1109\/CVPR52688.2022.00969"},{"key":"27_CR38","doi-asserted-by":"crossref","unstructured":"Hoyer, L., Dai, D., Van\u00a0Gool, L.: HRDA: Context-Aware High-Resolution Domain-Adaptive Semantic Segmentation. In: Proc. of ECCV. pp. 372\u2013391 (Oct 2022)","DOI":"10.1007\/978-3-031-20056-4_22"},{"key":"27_CR39","doi-asserted-by":"crossref","unstructured":"Hoyer, L., Dai, D., Van\u00a0Gool, L.: Domain Adaptive and Generalizable Network Architectures and Training Strategies for Semantic Image Segmentation. arXiv:2304.13615 pp. 1\u201315 (Apr 2023)","DOI":"10.1109\/TPAMI.2023.3320613"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Hoyer, L., Dai, D., Wang, H., Van\u00a0Gool, L.: MIC: Masked Image Consistency for Context-Enhanced Domain Adaptation. In: Proc. of CVPR. pp. 11721\u201311732 (Jun 2023)","DOI":"10.1109\/CVPR52729.2023.01128"},{"key":"27_CR41","doi-asserted-by":"crossref","unstructured":"Huang, J., Guan, D., Xiao, A., Lu, S.: FSDR: Frequency Space Domain Randomization for Domain Generalization. In: Proc. of CVPR. pp. 6891\u20136902 (Jun 2021)","DOI":"10.1109\/CVPR46437.2021.00682"},{"key":"27_CR42","doi-asserted-by":"crossref","unstructured":"Huang, J., Guan, D., Xiao, A., Lu, S.: Fsdr: Frequency space domain randomization for domain generalization. In: Proc. of CVPR. pp. 6891\u20136902 (2021)","DOI":"10.1109\/CVPR46437.2021.00682"},{"key":"27_CR43","unstructured":"Huh, M., Agrawal, P., Efros, A.A.: What makes imagenet good for transfer learning? arXiv preprint arXiv:1608.08614 (2016)"},{"key":"27_CR44","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.T., Parekh, Z., Pham, H., Le, Q., Sung, Y.H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proc. of ICML. pp. 4904\u20134916. PMLR (2021)"},{"key":"27_CR45","doi-asserted-by":"crossref","unstructured":"Jia, Y., Hoyer, L., Huang, S., Wang, T., Van\u00a0Gool, L., Schindler, K., Obukhov, A.: Dginstyle: Domain-generalizable semantic segmentation with image diffusion models and stylized semantic control. In: Synthetic Data for Computer Vision Workshop@ CVPR 2024 (2023)","DOI":"10.1007\/978-3-031-72933-1_6"},{"key":"27_CR46","doi-asserted-by":"crossref","unstructured":"Kerssies, T., De\u00a0Geus, D., Dubbelman, G.: How to benchmark vision foundation models for semantic segmentation? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1162\u20131171 (2024)","DOI":"10.1109\/CVPRW63382.2024.00123"},{"key":"27_CR47","unstructured":"Kim, N., Son, T., Lan, C., Zeng, W., Kwak, S.: WEDGE: Web-Image Assisted Domain Generalization for Semantic Segmentation. arXiv:2109.14196 pp. 1\u201314 (Sep 2021)"},{"key":"27_CR48","doi-asserted-by":"crossref","unstructured":"Kim, S., Kim, D.h., Kim, H.: Texture learning domain randomization for domain generalized segmentation. Proc. of ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00069"},{"key":"27_CR49","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., Doll\u00e1r, P.: Panoptic feature pyramid networks. In: Proc. of CVPR. pp. 6399\u20136408 (2019)","DOI":"10.1109\/CVPR.2019.00656"},{"key":"27_CR50","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.C., Lo, W.Y., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"27_CR51","doi-asserted-by":"crossref","unstructured":"Klingner, M., Term\u00f6hlen, J.A., Ritterbach, J., Fingscheidt, T.: Unsupervised batchnorm adaptation (ubna): A domain adaptation method for semantic segmentation without using source domain representations. In: Proc. of WACV. pp. 210\u2013220 (2022)","DOI":"10.1109\/WACVW54805.2022.00027"},{"key":"27_CR52","doi-asserted-by":"crossref","unstructured":"Kornblith, S., Shlens, J., Le, Q.V.: Do better imagenet models transfer better? In: Proc. of CVPR. pp. 2661\u20132671 (2019)","DOI":"10.1109\/CVPR.2019.00277"},{"key":"27_CR53","doi-asserted-by":"crossref","unstructured":"Lee, S., Seong, H., Lee, S., Kim, E.: WildNet: Learning Domain Generalized Semantic Segmentation from the Wild. In: Proc. of CVPR. pp. 9936\u20139946 (Jun 2022)","DOI":"10.1109\/CVPR52688.2022.00970"},{"key":"27_CR54","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"27_CR55","doi-asserted-by":"crossref","unstructured":"Li, D., Wu, A., Wang, Y., Han, Y.: Prompt-driven dynamic object-centric learning for single domain generalization. In: Proc. of CVPR. pp. 17606\u201317615 (2024)","DOI":"10.1109\/CVPR52733.2024.01667"},{"key":"27_CR56","doi-asserted-by":"crossref","unstructured":"Li, Y., Mao, H., Girshick, R.B., He, K.: Exploring plain vision transformer backbones for object detection. In: Proc. of ECCV (2022)","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"27_CR57","doi-asserted-by":"crossref","unstructured":"Liang, F., Wu, B., Dai, X., Li, K., Zhao, Y., Zhang, H., Zhang, P., Vajda, P., Marculescu, D.: Open-vocabulary semantic segmentation with mask-adapted clip. In: Proc. of CVPR. pp. 7061\u20137070 (June 2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"27_CR58","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: Proc. of ICLR (2018)"},{"key":"27_CR59","doi-asserted-by":"crossref","unstructured":"Mattolin, G., Zanella, L., Ricci, E., Wang, Y.: Confmix: Unsupervised domain adaptation for object detection via confidence-based mixing. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 423\u2013433 (2023)","DOI":"10.1109\/WACV56688.2023.00050"},{"key":"27_CR60","unstructured":"Michaelis, C., Mitzkus, B., Geirhos, R., Rusak, E., Bringmann, O., Ecker, A.S., Bethge, M., Brendel, W.: Benchmarking robustness in object detection: Autonomous driving when winter is coming. arXiv preprint arXiv:1907.07484 (2019)"},{"key":"27_CR61","doi-asserted-by":"crossref","unstructured":"Neuhold, G., Ollmann, T., Rota\u00a0Bulo, S., Kontschieder, P.: The mapillary vistas dataset for semantic understanding of street scenes. In: Proc. of ICCV. pp. 4990\u20134999 (2017)","DOI":"10.1109\/ICCV.2017.534"},{"key":"27_CR62","doi-asserted-by":"crossref","unstructured":"Niemeijer, J., Schwonberg, M., Term\u00f6hlen, J.A., Schmidt, N.M., Fingscheidt, T.: Generalization by adaptation: Diffusion-based domain extension for domain-generalized semantic segmentation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 2830\u20132840 (2024)","DOI":"10.1109\/WACV57701.2024.00281"},{"key":"27_CR63","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H., Szafraniec, M., Khalidov, V., Fernandez, P., Haziza, D., Massa, F., El-Nouby, A., et\u00a0al.: Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"27_CR64","doi-asserted-by":"crossref","unstructured":"Pan, X., Luo, P., Shi, J., Tang, X.: Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net. In: Proc. of ECCV. pp. 464\u2013479 (Sep 2018)","DOI":"10.1007\/978-3-030-01225-0_29"},{"key":"27_CR65","doi-asserted-by":"crossref","unstructured":"Pan, X., Zhan, X., Shi, J., Tang, X., Luo, P.: Switchable Whitening for Deep Representation Learning. In: Proc. of ICCV. pp. 1863\u20131871 (Oct 2019)","DOI":"10.1109\/ICCV.2019.00195"},{"key":"27_CR66","doi-asserted-by":"crossref","unstructured":"Peng, D., Lei, Y., Hayat, M., Guo, Y., Li, W.: Semantic-Aware Domain Generalized Segmentation. In: Proc. of CVPR. pp. 2594\u20132605 (Jun 2022)","DOI":"10.1109\/CVPR52688.2022.00262"},{"key":"27_CR67","first-page":"6594","volume":"30","author":"D Peng","year":"2021","unstructured":"Peng, D., Lei, Y., Liu, L., Zhang, P., Liu, J.: Global and Local Texture Randomization for Synthetic-to-Real Semantic Segmentation. In IEEE TIP 30, 6594\u20136608 (2021)","journal-title":"In IEEE TIP"},{"key":"27_CR68","unstructured":"Peng, Z., Dong, L., Bao, H., Ye, Q., Wei, F.: Beit v2: Masked image modeling with vector-quantized visual tokenizers. arXiv preprint arXiv:2208.06366 (2022)"},{"key":"27_CR69","unstructured":"Peng, Z., Wang, W., Dong, L., Hao, Y., Huang, S., Ma, S., Wei, F.: Kosmos-2: Grounding multimodal large language models to the world. ArXiv abs\/2306 (2023)"},{"key":"27_CR70","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proc. of ICML. pp. 8748\u20138763. PMLR (2021)"},{"key":"27_CR71","doi-asserted-by":"crossref","unstructured":"Rao, Y., Zhao, W., Chen, G., Tang, Y., Zhu, Z., Huang, G., Zhou, J., Lu, J.: Denseclip: Language-guided dense prediction with context-aware prompting. In: Proc. of CVPR. pp. 18082\u201318091 (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"27_CR72","doi-asserted-by":"crossref","unstructured":"Richter, S.R., Vineet, V., Roth, S., Koltun, V.: Playing for data: Ground truth from computer games. In: Proc. of ECCV. pp. 102\u2013118. Springer (2016)","DOI":"10.1007\/978-3-319-46475-6_7"},{"key":"27_CR73","unstructured":"Ridnik, T., Ben-Baruch, E., Noy, A., Zelnik-Manor, L.: Imagenet-21k pretraining for the masses. arXiv preprint arXiv:2104.10972 (2021)"},{"key":"27_CR74","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models (2021)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"27_CR75","doi-asserted-by":"crossref","unstructured":"Ros, G., Sellart, L., Materzynska, J., Vazquez, D., Lopez, A.M.: The synthia dataset: A large collection of synthetic images for semantic segmentation of urban scenes. In: Proc. of CVPR. pp. 3234\u20133243 (2016)","DOI":"10.1109\/CVPR.2016.352"},{"key":"27_CR76","unstructured":"Sakaridis, C., Bruggemann, D., Yu, F., Van\u00a0Gool, L.: Condition-invariant semantic segmentation. arXiv preprint arXiv:2305.17349 (2023)"},{"key":"27_CR77","doi-asserted-by":"crossref","unstructured":"Sakaridis, C., Dai, D., Van\u00a0Gool, L.: ACDC: The Adverse Conditions Dataset with Correspondences for Semantic Driving Scene Understanding. In: Proc. of ICCV. pp. 10765\u201310775 (Oct 2021)","DOI":"10.1109\/ICCV48922.2021.01059"},{"key":"27_CR78","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C.W., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., Schramowski, P., Kundurthy, S.R., Crowson, K., Schmidt, L., Kaczmarczyk, R., Jitsev, J.: LAION-5b: An open large-scale dataset for training next generation image-text models. In: Proc. of NeurIPS Datasets and Benchmarks Track (2022), https:\/\/openreview.net\/forum?id=M3Y74vmsMcY"},{"key":"27_CR79","doi-asserted-by":"crossref","unstructured":"Schwonberg, M., El\u00a0Bouazati, F., Schmidt, N.M., Gottschalk, H.: Augmentation-Based Domain Generalization for Semantic Segmentation. In: Proc. of IV - Workshops. pp.\u00a01\u20138 (Jun 2023)","DOI":"10.1109\/IV55152.2023.10186752"},{"key":"27_CR80","doi-asserted-by":"publisher","first-page":"54296","DOI":"10.1109\/ACCESS.2023.3277785","volume":"11","author":"M Schwonberg","year":"2023","unstructured":"Schwonberg, M., Niemeijer, J., Term\u00f6hlen, J.A., Sch\u00e4fer, J.P., Schmidt, N.M., Gottschalk, H., Fingscheidt, T.: Survey on Unsupervised Domain Adaptation for Semantic Segmentation for Visual Perception in Automated Driving. IEEE Access 11, 54296\u201354336 (2023)","journal-title":"IEEE Access"},{"key":"27_CR81","unstructured":"Shen, S., Li, C., Hu, X., Xie, Y., Yang, J., Zhang, P., Rohrbach, A., Gan, Z., Wang, L., Yuan, L., et\u00a0al.: K-lite: Learning transferable visual models with external knowledge. In: Proc. of NeurIPS (2022)"},{"key":"27_CR82","doi-asserted-by":"crossref","unstructured":"Shi, C., Yang, S.: Edadet: Open-vocabulary object detection using early dense alignment. In: Proc. of ICCV (October 2023)","DOI":"10.1109\/ICCV51070.2023.01441"},{"key":"27_CR83","doi-asserted-by":"crossref","unstructured":"Singh, A., Hu, R., Goswami, V., Couairon, G., Galuba, W., Rohrbach, M., Kiela, D.: FLAVA: A foundational language and vision alignment model. In: Proc. of CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"27_CR84","unstructured":"Sun, Q., Chen, H., Zheng, M., Wu, Z., Felsberg, M., Tang, Y.: IBAFormer: Intra-batch Attention Transformer for Domain Generalized Semantic Segmentation. arXiv:2309.06282 pp. 1\u201310 (Sep 2023)"},{"key":"27_CR85","unstructured":"Sun, Q., Melnyk, P., Felsberg, M., Tang, Y.: Augment Features Beyond Color for Domain Generalized Segmentation. arXiv:2307.01703 pp. 1\u201310 (Jul 2023)"},{"key":"27_CR86","unstructured":"Sun, Q., Fang, Y., Wu, L., Wang, X., Cao, Y.: Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)"},{"key":"27_CR87","unstructured":"Tao, A., Sapra, K., Catanzaro, B.: Hierarchical multi-scale attention for semantic segmentation. arXiv preprint arXiv:2005.10821 (2020)"},{"key":"27_CR88","doi-asserted-by":"crossref","unstructured":"Term\u00f6hlen, J.A., Bartels, T., Fingscheidt, T.: A Re-Parameterized Vision Transformer (ReVT) for Domain-Generalized Semantic Segmentation. arXiv:2308.13331 pp. 1\u201318 (Aug 2023)","DOI":"10.1109\/ICCVW60793.2023.00472"},{"key":"27_CR89","doi-asserted-by":"crossref","unstructured":"Term\u00f6hlen, J.A., Bartels, T., Fingscheidt, T.: A Re-Parameterized Vision Transformer (ReVT) for Domain-Generalized Semantic Segmentation. In: Proc. of ICCV - Workshops. pp. 1\u201310. Paris, France (Oct 2023)","DOI":"10.1109\/ICCVW60793.2023.00472"},{"key":"27_CR90","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., J\u00e9gou, H.: Deit iii: Revenge of the vit. In: Proc. of ECCV (2022)","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"27_CR91","doi-asserted-by":"crossref","unstructured":"Tranheden, W., Olsson, V., Pinto, J., Svensson, L.: DACS: Domain Adaptation via Cross-Domain Mixed Sampling. In: Proc. of WACV. pp. 1379\u20131389 (Jan 2021)","DOI":"10.1109\/WACV48630.2021.00142"},{"key":"27_CR92","doi-asserted-by":"crossref","unstructured":"Tsai, Y.H., Hung, W.C., Schulter, S., Sohn, K., Yang, M.H., Chandraker, M.: Learning to Adapt Structured Output Space for Semantic Segmentation. In: Proc. of CVPR. pp. 7472\u20137481 (Jun 2018)","DOI":"10.1109\/CVPR.2018.00780"},{"key":"27_CR93","doi-asserted-by":"crossref","unstructured":"Vidit, V., Engilberge, M., Salzmann, M.: Clip the gap: A single domain generalization approach for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3219\u20133229 (2023)","DOI":"10.1109\/CVPR52729.2023.00314"},{"key":"27_CR94","doi-asserted-by":"crossref","unstructured":"Wang, T.: Learning to detect and segment for open vocabulary object detection. In: Proc. of CVPR. pp. 7051\u20137060 (June 2023)","DOI":"10.1109\/CVPR52729.2023.00681"},{"key":"27_CR95","doi-asserted-by":"crossref","unstructured":"Wang, W., Dai, J., Chen, Z., Huang, Z., Li, Z., Zhu, X., Hu, X., Lu, T., Lu, L., Li, H., et\u00a0al.: Internimage: Exploring large-scale vision foundation models with deformable convolutions. In: Proc. of CVPR. pp. 14408\u201314419 (2023)","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"27_CR96","doi-asserted-by":"crossref","unstructured":"Wang, W., Bao, H., Dong, L., Bjorck, J., Peng, Z., Liu, Q., Aggarwal, K., Mohammed, O.K., Singhal, S., Som, S., Wei, F.: Image as a foreign language: Beit pretraining for vision and vision-language tasks. In: Proc. of CVPR. pp. 19175\u201319186 (June 2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"27_CR97","doi-asserted-by":"crossref","unstructured":"Wang, Z., Yu, M., Wei, Y., Feris, R., Xiong, J., Hwu, W., Huang, T.S., Shi, H.: Differential Treatment for Stuff and Things: A Simple Unsupervised Domain Adaptation Method for Semantic Segmentation. In: Proc. of CVPR. pp. 12635\u201312644 (Jun 2020)","DOI":"10.1109\/CVPR42600.2020.01265"},{"key":"27_CR98","doi-asserted-by":"crossref","unstructured":"Wei, Z., Chen, L., Jin, Y., Ma, X., Liu, T., Ling, P., Wang, B., Chen, H., Zheng, J.: Stronger fewer & superior: Harnessing vision foundation models for domain generalized semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 28619\u201328630 (2024)","DOI":"10.1109\/CVPR52733.2024.02704"},{"key":"27_CR99","doi-asserted-by":"crossref","unstructured":"Wu, A., Deng, C.: Single-domain generalized object detection in urban scene via cyclic-disentangled self-distillation. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition. pp. 847\u2013856 (2022)","DOI":"10.1109\/CVPR52688.2022.00092"},{"key":"27_CR100","doi-asserted-by":"crossref","unstructured":"Wu, F., Gao, J., Hong, L., Wang, X., Zhou, C., Ye, N.: G-nas: Generalizable neural architecture search for single domain generalization object detection. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a038, pp. 5958\u20135966 (2024)","DOI":"10.1609\/aaai.v38i6.28410"},{"key":"27_CR101","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers. In: Proc. of NeurIPS. pp. 12077\u201312090 (Dec 2021)"},{"key":"27_CR102","doi-asserted-by":"crossref","unstructured":"Xie, Z., Zhang, Z., Cao, Y., Lin, Y., Bao, J., Yao, Z., Dai, Q., Hu, H.: Simmim: A simple framework for masked image modeling. In: Proc. of CVPR. pp. 9653\u20139663 (2022)","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"27_CR103","doi-asserted-by":"crossref","unstructured":"Xu, Q., Yao, L., Jiang, Z., Jiang, G., Chu, W., Han, W., Zhang, W., Wang, C., Tai, Y.: DIRL: Domain-Invariant Representation Learning for Generalizable Semantic Segmentation. In: Proc. of AAAI. pp. 2884\u20132892 (Jun 2022)","DOI":"10.1609\/aaai.v36i3.20193"},{"key":"27_CR104","doi-asserted-by":"crossref","unstructured":"Yamada, Y., Otani, M.: Does robustness on imagenet transfer to downstream tasks? In: Proc. of CVPR. pp. 9215\u20139224 (2022)","DOI":"10.1109\/CVPR52688.2022.00900"},{"key":"27_CR105","doi-asserted-by":"crossref","unstructured":"Yu, F., Chen, H., Wang, X., Xian, W., Chen, Y., Liu, F., Madhavan, V., Darrell, T.: Bdd100k: A diverse driving dataset for heterogeneous multitask learning. In: Proc. of CVPR. pp. 2636\u20132645 (2020)","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"27_CR106","unstructured":"Yu, Q., He, J., Deng, X., Shen, X., Chen, L.C.: Convolutions die hard: Open-vocabulary segmentation with single frozen convolutional clip. In: Proc. of NeurIPS (2023)"},{"key":"27_CR107","doi-asserted-by":"crossref","unstructured":"Yue, X., Zhang, Y., Zhao, S., Sangiovanni-Vincentelli, A., Keutzer, K., Gong, B.: Domain Randomization and Pyramid Consistency: Simulation-to-Real Generalization Without Accessing Target Domain Data. In: Proc. of ICCV. pp. 2100\u20132110 (Oct 2019)","DOI":"10.1109\/ICCV.2019.00219"},{"key":"27_CR108","doi-asserted-by":"crossref","unstructured":"Yue, X., Zhang, Y., Zhao, S., Sangiovanni-Vincentelli, A., Keutzer, K., Gong, B.: Domain randomization and pyramid consistency: Simulation-to-real generalization without accessing target domain data. In: Proc. of ICCV. pp. 2100\u20132110 (2019)","DOI":"10.1109\/ICCV.2019.00219"},{"key":"27_CR109","unstructured":"Zeng, Y., Zhang, X., Li, H., Wang, J., Zhang, J., Zhou, W.: X 2-vlm: All-in-one pre-trained model for vision-language tasks. arXiv preprint arXiv:2211.12402 (2022)"},{"key":"27_CR110","unstructured":"Zhang, K., Sun, Y., Wang, R., Li, H., Hu, X.: Multiple fusion adaptation: A strong framework for unsupervised semantic segmentation adaptation. arXiv preprint arXiv:2112.00295 (2021)"},{"key":"27_CR111","doi-asserted-by":"crossref","unstructured":"Zhang, P., Zhang, B., Zhang, T., Chen, D., Wang, Y., Wen, F.: Prototypical Pseudo Label Denoising and Target Structure Learning for Domain Adaptive Semantic Segmentation. In: Proc. of CVPR. pp. 12414\u201312424 (2021)","DOI":"10.1109\/CVPR46437.2021.01223"},{"key":"27_CR112","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Zhong, Z., Zhao, N., Sebe, N., Lee, G.H.: Style-Hallucinated Dual Consistency Learning for Domain Generalized Semantic Segmentation. In: Proc. of ECCV. pp. 535\u2013552 (2022)","DOI":"10.1007\/978-3-031-19815-1_31"},{"key":"27_CR113","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L.H., Zhou, L., Dai, X., Yuan, L., Li, Y., et\u00a0al.: Regionclip: Region-based language-image pretraining. In: Proc. of CVPR. pp. 16793\u201316803 (2022)","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"27_CR114","unstructured":"Zhong, Z., Zhao, Y., Lee, G.H., Sebe, N.: Adversarial Style Augmentation for Domain Generalized Urban-Scene Segmentation. In: Proc. of NeurIPS. pp. 338\u2013350 (Dec 2022)"},{"key":"27_CR115","unstructured":"Zhou, J., Wei, C., Wang, H., Shen, W., Xie, C., Yuille, A., Kong, T.: ibot: Image bert pre-training with online tokenizer. arXiv preprint arXiv:2111.07832 (2021)"},{"key":"27_CR116","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In Proc. of CVPR pp. 16795\u201316804 (2022), https:\/\/api.semanticscholar.org\/CorpusID:247363011","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"27_CR117","unstructured":"Zhou, Z., Lei, Y., Zhang, B., Liu, L., Liu, Y.: Zegclip: Towards adapting clip for zero-shot semantic segmentation. In: Proc. of CVPR"},{"key":"27_CR118","first-page":"3833","volume":"33","author":"B Zoph","year":"2020","unstructured":"Zoph, B., Ghiasi, G., Lin, T.Y., Cui, Y., Liu, H., Cubuk, E.D., Le, Q.: Rethinking pre-training and self-training. In Proc. of NeurIPS 33, 3833\u20133845 (2020)","journal-title":"In Proc. of NeurIPS"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0972-7_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,9]],"date-time":"2024-12-09T09:11:37Z","timestamp":1733735497000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0972-7_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,10]]},"ISBN":["9789819609710","9789819609727"],"references-count":118,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0972-7_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,10]]},"assertion":[{"value":"10 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}