{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T14:41:48Z","timestamp":1779115308277,"version":"3.51.4"},"reference-count":109,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T00:00:00Z","timestamp":1698710400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T00:00:00Z","timestamp":1698710400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2024,4]]},"DOI":"10.1007\/s11263-023-01866-y","type":"journal-article","created":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T01:02:14Z","timestamp":1698714134000},"page":"1167-1186","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["InstaFormer++: Multi-Domain Instance-Aware Image-to-Image Translation with Transformer"],"prefix":"10.1007","volume":"132","author":[{"given":"Soohyun","family":"Kim","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jongbeom","family":"Baek","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jihye","family":"Park","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Eunjae","family":"Ha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Homin","family":"Jung","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Taeyoung","family":"Lee","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2927-6273","authenticated-orcid":false,"given":"Seungryong","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,10,31]]},"reference":[{"key":"1866_CR1","doi-asserted-by":"crossref","unstructured":"Abdal, R., Zhu, P., Femiani, J., Mitra, N. J., & Wonka, P. (2021). Clip2stylegan: Unsupervised extraction of stylegan edit directions. arXiv preprint arXiv:2112.05219.","DOI":"10.1145\/3528233.3530747"},{"key":"1866_CR2","unstructured":"Ba, J. L., Kiros, J. R., & Hinton, G. E. (2016). Layer normalization. arXiv preprint arXiv:1607.06450."},{"key":"1866_CR3","doi-asserted-by":"crossref","unstructured":"Baek, K., Choi, Y., Uh, Y., Yoo, J., & Shim, H. (2021). Rethinking the truly unsupervised image-to-image translation. In ICCV, pp. 14154\u201314163.","DOI":"10.1109\/ICCV48922.2021.01389"},{"key":"1866_CR4","unstructured":"Bau, D., Andonian, A., Cui, A., Park, Y., Jahanian, A., Oliva, A., & Torralba, A. (2021). Paint by word. arXiv preprint arXiv:2103.10951."},{"key":"1866_CR5","doi-asserted-by":"crossref","unstructured":"Bhattacharjee, D., Kim, S., Vizier, G., & Salzmann, M. (2020). Dunit: Detection-based unsupervised image-to-image translation. In CVPR, pp. 4787\u20134796.","DOI":"10.1109\/CVPR42600.2020.00484"},{"key":"1866_CR6","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1016\/j.cviu.2018.10.009","volume":"179","author":"A Borji","year":"2019","unstructured":"Borji, A. (2019). Pros and cons of gan evaluation measures. Computer Vision and Image Understanding, 179, 41\u201365.","journal-title":"Computer Vision and Image Understanding"},{"key":"1866_CR7","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., & Efros, A. A. (2023). Instructpix2pix: Learning to follow image editing instructions. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 18392\u201318402.","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"1866_CR8","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In ECCV, Springer, pp. 213\u2013229.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1866_CR9","doi-asserted-by":"crossref","unstructured":"Chen, H., Wang, Y., Guo, T., Xu, C., Deng, Y., Liu, Z., Ma, S., Xu, C., Xu, C., & Gao, W. (2021). Pre-trained image processing transformer. In CVPR, pp. 12299\u201312310.","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"1866_CR10","doi-asserted-by":"crossref","unstructured":"Choi, Y., Choi, M., Kim, M., Ha, J. W., Kim, S., & Choo, J. (2018). Stargan: Unified generative adversarial networks for multi-domain image-to-image translation. In CVPR, pp. 8789\u20138797.","DOI":"10.1109\/CVPR.2018.00916"},{"key":"1866_CR11","doi-asserted-by":"crossref","unstructured":"Choi, Y., Uh, Y., Yoo, J., & Ha, J. W. (2020a). Stargan v2: Diverse image synthesis for multiple domains. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR42600.2020.00821"},{"key":"1866_CR12","doi-asserted-by":"crossref","unstructured":"Choi, Y., Uh, Y., Yoo, J., & Ha, J. W. (2020b). Stargan v2: Diverse image synthesis for multiple domains. In CVPR, pp. 8188\u20138197.","DOI":"10.1109\/CVPR42600.2020.00821"},{"key":"1866_CR13","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., Franke, U., Roth, S., & Schiele, B. (2016). The cityscapes dataset for semantic urban scene understanding. In CVPR, pp. 3213\u20133223.","DOI":"10.1109\/CVPR.2016.350"},{"key":"1866_CR14","doi-asserted-by":"crossref","unstructured":"Couairon, G., Grechka, A., Verbeek, J., Schwenk, H., & Cord, M. (2022). Flexit: Towards flexible semantic image translation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 18270\u201318279.","DOI":"10.1109\/CVPR52688.2022.01773"},{"key":"1866_CR15","doi-asserted-by":"crossref","unstructured":"Dai, Z., Cai, B., Lin, Y., & Chen, J. (2021a). Up-detr: Unsupervised pre-training for object detection with transformers. In CVPR, pp. 1601\u20131610.","DOI":"10.1109\/CVPR46437.2021.00165"},{"key":"1866_CR16","unstructured":"Dai, Z., Liu, H., Le ,Q. V., & Tan. M. (2021b). Coatnet: Marrying convolution and attention for all data sizes. arXiv preprint arXiv:2106.04803."},{"key":"1866_CR17","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., & Nichol, A. (2021). Diffusion models beat gans on image synthesis. Advances in Neural Information Processing Systems, 34, 8780\u20138794.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"2","key":"1866_CR18","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1109\/TPAMI.2015.2439281","volume":"38","author":"C Dong","year":"2015","unstructured":"Dong, C., Loy, C. C., He, K., & Tang, X. (2015). Image super-resolution using deep convolutional networks. TPAMI, 38(2), 295\u2013307.","journal-title":"TPAMI"},{"key":"1866_CR19","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929."},{"key":"1866_CR20","doi-asserted-by":"crossref","unstructured":"Gabbay, A., & Hoshen, Y. (2021). Scaling-up disentanglement for image translation. arXiv preprint arXiv:2103.14017.","DOI":"10.1109\/ICCV48922.2021.00671"},{"key":"1866_CR21","doi-asserted-by":"crossref","unstructured":"Gal, R., Patashnik, O., Maron, H., Chechik, G., & Cohen-Or, D. (2021). Stylegan-nada: Clip-guided domain adaptation of image generators. arXiv preprint arXiv:2108.00946.","DOI":"10.1145\/3528223.3530164"},{"key":"1866_CR22","doi-asserted-by":"crossref","unstructured":"Gatys, L. A., Ecker, A. S., & Bethge, M. (2016) Image style transfer using convolutional neural networks. In CVPR, pp. 2414\u20132423.","DOI":"10.1109\/CVPR.2016.265"},{"key":"1866_CR23","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., & Urtasun, R. (2012). Are we ready for autonomous driving? The Kitti vision benchmark suite. In CVPR, IEEE, pp. 3354\u20133361.","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"1866_CR24","unstructured":"Gonzalez-Garcia, A., Van De\u00a0Weijer, J., & Bengio, Y. (2018). Image-to-image translation for cross-domain disentanglement. arXiv preprint arXiv:1805.09730."},{"key":"1866_CR25","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014). Generative adversarial nets. In NeurIPS, pp. 2672\u20132680."},{"key":"1866_CR26","doi-asserted-by":"crossref","unstructured":"Graham, B., El-Nouby, A., Touvron, H., Stock, P., Joulin, A., J\u00e9gou, H., & Douze, M. (2021). Levit: A vision transformer in convnet\u2019s clothing for faster inference. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 12259\u201312269.","DOI":"10.1109\/ICCV48922.2021.01204"},{"key":"1866_CR27","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In ICCV, pp. 2961\u20132969.","DOI":"10.1109\/ICCV.2017.322"},{"key":"1866_CR28","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., & Cohen-Or, D. (2022). Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626."},{"key":"1866_CR29","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., & Hochreiter, S. (2017). Gans trained by a two time-scale update rule converge to a local nash equilibrium. In NeurIPS, pp. 6626\u20136637."},{"key":"1866_CR30","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, 33, 6840\u20136851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"1866_CR31","unstructured":"Hoffman, J., Tzeng, E., Park, T., Zhu, J. Y., Isola, P., Saenko, K., Efros, A., & Darrell, T. (2018). Cycada: Cycle-consistent adversarial domain adaptation. In ICML, pp. 1989\u20131998."},{"key":"1866_CR32","doi-asserted-by":"crossref","unstructured":"Huang, X., & Belongie, S. (2017). Arbitrary style transfer in real-time with adaptive instance normalization. In ICCV, pp. 1501\u20131510.","DOI":"10.1109\/ICCV.2017.167"},{"key":"1866_CR33","doi-asserted-by":"crossref","unstructured":"Huang, X., Liu, M. Y., Belongie, S., & Kautz, J. (2018). Multimodal unsupervised image-to-image translation. In ECCV, pp. 172\u2013189.","DOI":"10.1007\/978-3-030-01219-9_11"},{"key":"1866_CR34","unstructured":"Hudson, D. A., & Zitnick, C. L. (2021). Generative adversarial transformers. arXiv preprint arXiv:2103.01209."},{"issue":"4","key":"1866_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073659","volume":"36","author":"S Iizuka","year":"2017","unstructured":"Iizuka, S., Simo-Serra, E., & Ishikawa, H. (2017). Globally and locally consistent image completion. ACM Transactions on Graphics (ToG), 36(4), 1\u201314.","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"1866_CR36","doi-asserted-by":"crossref","unstructured":"Inoue, N., Furuta, R., Yamasaki, T., & Aizawa, K. (2018). Cross-domain weakly-supervised object detection through progressive domain adaptation. In CVPR, pp. 5001\u20135009.","DOI":"10.1109\/CVPR.2018.00525"},{"key":"1866_CR37","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J. Y., Zhou, T., & Efros, A. A. (2017). Image-to-image translation with conditional adversarial networks. In CVPR, pp. 1125\u20131134.","DOI":"10.1109\/CVPR.2017.632"},{"key":"1866_CR38","doi-asserted-by":"crossref","unstructured":"Jeong, S., Kim, Y., Lee, E., & Sohn, K. (2021). Memory-guided unsupervised image-to-image translation. In CVPR, pp. 6558\u20136567.","DOI":"10.1109\/CVPR46437.2021.00649"},{"key":"1866_CR39","doi-asserted-by":"crossref","unstructured":"Jiang, L., Zhang, C., Huang, M., Liu, C., Shi, J., Loy, C. C. (2020). Tsit: A simple and versatile framework for image-to-image translation. In European conference on computer vision, Springer, pp. 206\u2013222.","DOI":"10.1007\/978-3-030-58580-8_13"},{"key":"1866_CR40","unstructured":"Jiang, Y., Chang, S., & Wang, Z. (2021). Transgan: Two transformers can make one strong gan. arXiv preprint arXiv:2102.07074."},{"key":"1866_CR41","unstructured":"Katharopoulos, A., Vyas, A., Pappas, N., & Fleuret, F. (2020). Transformers are rnns: Fast autoregressive transformers with linear attention. In ICML, pp. 5156\u20135165."},{"key":"1866_CR42","doi-asserted-by":"crossref","unstructured":"Kim, J., Kwon\u00a0Lee, J., & Mu\u00a0Lee, K. (2016). Accurate image super-resolution using very deep convolutional networks. In CVPR, pp. 1646\u20131654.","DOI":"10.1109\/CVPR.2016.182"},{"key":"1866_CR43","unstructured":"Kim, S., Baek, J., Park, J., Kim, G., & Kim, S. (2020). Instaformer: Instance-aware image-to-image translation with transformer. In CVPR."},{"key":"1866_CR44","unstructured":"Kim, T., Cha, M., Kim, H., Lee, J. K., & Kim, J. (2017). Learning to discover cross-domain relations with generative adversarial networks. arXiv preprint arXiv:1703.05192."},{"key":"1866_CR45","doi-asserted-by":"crossref","unstructured":"Kim, T., Jeong, M., Kim, S., Choi, S., & Kim, C. (2019). Diversify and match: A domain adaptive representation learning paradigm for object detection. In CVPR, pp. 12456\u201312465.","DOI":"10.1109\/CVPR.2019.01274"},{"key":"1866_CR46","doi-asserted-by":"crossref","unstructured":"Kwon, G., & Ye, J. C. (2021). Clipstyler: Image style transfer with a single text condition. arXiv preprint arXiv:2112.00374.","DOI":"10.1109\/CVPR52688.2022.01753"},{"key":"1866_CR47","doi-asserted-by":"crossref","unstructured":"Lee, H. Y., Tseng, H. Y., Huang, J. B., Singh, M., & Yang, M. H. (2018). Diverse image-to-image translation via disentangled representations. In ECCV, pp. 35\u201351.","DOI":"10.1007\/978-3-030-01246-5_3"},{"issue":"10","key":"1866_CR48","doi-asserted-by":"publisher","first-page":"2402","DOI":"10.1007\/s11263-019-01284-z","volume":"128","author":"HY Lee","year":"2020","unstructured":"Lee, H. Y., Tseng, H. Y., Mao, Q., Huang, J. B., Lu, Y. D., Singh, M., & Yang, M. H. (2020). Drit++: Diverse image-to-image translation via disentangled representations. International Journal of Computer Vision, 128(10), 2402\u20132417.","journal-title":"International Journal of Computer Vision"},{"key":"1866_CR49","unstructured":"Lee, K., Chang, H., Jiang, L., Zhang, H., Tu, Z., & Liu, C. (2021). Vitgan: Training gans with vision transformers. arXiv preprint arXiv:2107.04589."},{"key":"1866_CR50","unstructured":"Li, W., Wang, X., Xia, X., Wu, J., Xiao, X., Zheng, M., & Wen, S. (2022). Sepvit: Separable vision transformer. arXiv preprint arXiv:2203.15380."},{"key":"1866_CR51","doi-asserted-by":"crossref","unstructured":"Li, Y., Liu, H., Wu, Q., Mu, F., Yang, J., Gao, J., Li, C., & Lee, Y. J. (2023). Gligen: Open-set grounded text-to-image generation. arXiv preprint arXiv:2301.07093.","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"1866_CR52","unstructured":"Liu, H., Dai, Z., So, D. R., & Le, Q. V. (2021a). Pay attention to mlps. arXiv preprint arXiv:2105.08050."},{"key":"1866_CR53","unstructured":"Liu, M. Y., Breuel, T., & Kautz, J. (2017) Unsupervised image-to-image translation networks. In NeurIPS, pp. 700\u2013708."},{"key":"1866_CR54","doi-asserted-by":"crossref","unstructured":"Liu, M. Y., Huang, X., Mallya, A., Karras, T., Aila, T., Lehtinen, J., & Kautz, J. (2019). Few-shot unsueprvised image-to-image translation. In arxiv.","DOI":"10.1109\/ICCV.2019.01065"},{"key":"1866_CR55","unstructured":"Liu, X., Gong, C., Wu, L., Zhang, S., Su, H., & Liu, Q. (2021b). Fusedream: Training-free text-to-image generation with improved clip+ gan space optimization. arXiv preprint arXiv:2112.01573."},{"key":"1866_CR56","doi-asserted-by":"crossref","unstructured":"Liu Y, Sangineto E, Chen, Y., Bao, L., Zhang, H., Sebe, N., Lepri, B., Wang, W., & De\u00a0Nadai, M. (2021c). Smoothing the disentangled latent style space for unsupervised image-to-image translation. In CVPR, pp. 10785\u201310794.","DOI":"10.1109\/CVPR46437.2021.01064"},{"key":"1866_CR57","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021d). Swin transformer: Hierarchical vision transformer using shifted windows. arXiv preprint arXiv:2103.14030.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1866_CR58","unstructured":"Melas-Kyriazi, L. (2021). Do you even need attention? A stack of feed-forward layers does surprisingly well on imagenet. arXiv preprint arXiv:2105.02723."},{"key":"1866_CR59","unstructured":"Mirza, M., & Osindero, S. (2014). Conditional generative adversarial nets. arXiv preprint arXiv:1411.1784."},{"key":"1866_CR60","unstructured":"Mo, S., Cho, M., & Shin, J. (2018). Instagan: Instance-aware image-to-image translation. arXiv preprint arXiv:1812.10889."},{"key":"1866_CR61","unstructured":"Oord, A., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748."},{"key":"1866_CR62","doi-asserted-by":"crossref","unstructured":"Park, J., Kim, S., Kim, S., Cho, S., Yoo, J., Uh, Y., & Kim, S. (2023). Lanit: Language-driven image-to-image translation for unlabeled data. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 23401\u201323411.","DOI":"10.1109\/CVPR52729.2023.02241"},{"key":"1866_CR63","doi-asserted-by":"crossref","unstructured":"Park, T., Liu, M. Y., Wang, T. C., & Zhu, J. Y. (2019). Semantic image synthesis with spatially-adaptive normalization. In CVPR, pp. 2337\u20132346.","DOI":"10.1109\/CVPR.2019.00244"},{"key":"1866_CR64","doi-asserted-by":"crossref","unstructured":"Park, T., Efros, A. A., Zhang, R., & Zhu J. Y. (2020). Contrastive learning for unpaired image-to-image translation. arXiv preprint arXiv:2007.15651.","DOI":"10.1007\/978-3-030-58545-7_19"},{"key":"1866_CR65","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Wu, Z., Shechtman, E., Cohen-Or, D., & Lischinski, D. (2021). Styleclip: Text-driven manipulation of stylegan imagery. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 2085\u20132094.","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"1866_CR66","doi-asserted-by":"crossref","unstructured":"Pathak, D., Krahenbuhl, P., Donahue, J., Darrell, T., & Efros, A. A. (2016). Context encoders: Feature learning by inpainting. In CVPR, pp. 2536\u20132544.","DOI":"10.1109\/CVPR.2016.278"},{"key":"1866_CR67","doi-asserted-by":"crossref","unstructured":"Peebles, W., & Xie, S. (2022). Scalable diffusion models with transformers. arXiv preprint arXiv:2212.09748.","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"1866_CR68","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, pp. 8748\u20138763."},{"key":"1866_CR69","unstructured":"Rahaman, N., Baratin, A., Arpit, D., Draxler, F., Lin, M., Hamprecht, F., Bengio, Y., & Courville, A. (2019) On the spectral bias of neural networks. In ICML, pp. 5301\u20135310."},{"key":"1866_CR70","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I. (2021). Zero-shot text-to-image generation. In International conference on machine learning, pp. 8821\u20138831."},{"key":"1866_CR71","first-page":"91","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. NeurIPS, 28, 91\u201399.","journal-title":"NeurIPS"},{"key":"1866_CR72","unstructured":"Rodriguez, A. L., & Mikolajczyk, K. (2019). Domain adaptation for object detection via style consistency. arXiv preprint arXiv:1911.10033."},{"key":"1866_CR73","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2021). arXiv:2112.10752."},{"key":"1866_CR74","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"1866_CR75","doi-asserted-by":"crossref","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E., Ghasemipour, S. K. S., Ayan, B. K., Mahdavi, S. S., Lopes, R. G., Salimans, T., Salimans, T., Ho, J., Fleet, D. J., & Norouzi, M. (2022). Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487.","DOI":"10.1145\/3528233.3530757"},{"key":"1866_CR76","doi-asserted-by":"crossref","unstructured":"Saito, K., Saenko, K., & Liu, M. Y. (2020). Coco-funit: Few-shot unsupervised image translation with a content conditioned style encoder. In European conference on computer vision, Springer, pp. 382\u2013398.","DOI":"10.1007\/978-3-030-58580-8_23"},{"key":"1866_CR77","doi-asserted-by":"crossref","unstructured":"Sakaridis, C., Dai, D., & Gool, L. V. (2019). Guided curriculum model adaptation and uncertainty-aware evaluation for semantic nighttime image segmentation. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 7374\u20137383.","DOI":"10.1109\/ICCV.2019.00747"},{"key":"1866_CR78","first-page":"2234","volume":"29","author":"T Salimans","year":"2016","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., & Chen, X. (2016). Improved techniques for training gans. NeurIPS, 29, 2234\u20132242.","journal-title":"NeurIPS"},{"key":"1866_CR79","doi-asserted-by":"crossref","unstructured":"Shen, Z., Huang, M., Shi, J., Xue, X., & Huang, T. S. (2019). Towards instance-level image-to-image translation. In CVPR, pp. 3683\u20133692.","DOI":"10.1109\/CVPR.2019.00380"},{"key":"1866_CR80","unstructured":"Tolstikhin, I., Houlsby, N., Kolesnikov, A., Beyer, L., Zhai, X., Unterthiner, T., Yung, J., Steiner, A., Keysers, D., Uszkoreit, J., et al. (2021). Mlp-mixer: An all-mlp architecture for vision. arXiv preprint arXiv:2105.01601."},{"key":"1866_CR81","doi-asserted-by":"crossref","unstructured":"Touvron, H., Bojanowski, P., Caron, M., Cord, M., El-Nouby, A., Grave, E., Izacard, G., Joulin, A., Synnaeve, G., Verbeek, J., et al. (2021a). Resmlp: Feedforward networks for image classification with data-efficient training. arXiv preprint arXiv:2105.03404.","DOI":"10.1109\/TPAMI.2022.3206148"},{"key":"1866_CR82","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., & J\u00e9gou, H. (2021b). Training data-efficient image transformers and distillation through attention. In: ICML, pp. 10347\u201310357."},{"key":"1866_CR83","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. In NeurIPS, pp. 5998\u20136008."},{"key":"1866_CR84","doi-asserted-by":"crossref","unstructured":"Wang, T. C., Liu, M. Y., Zhu, J. Y., Tao, A., Kautz, J., & Catanzaro, B. (2018). High-resolution image synthesis and semantic manipulation with conditional gans. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 8798\u20138807.","DOI":"10.1109\/CVPR.2018.00917"},{"key":"1866_CR85","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, D. P., Song, K., Liang, D., Lu, T., Luo, P., & Shao, L. (2021a). Pvtv2: Improved baselines with pyramid vision transformer. arXiv preprint arXiv:2106.13797.","DOI":"10.1007\/s41095-022-0274-8"},{"key":"1866_CR86","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, D. P., Song, K., Liang, D., Lu, T., Luo, P., & Shao, L. (2021b). Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. arXiv preprint arXiv:2102.12122.","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"1866_CR87","doi-asserted-by":"crossref","unstructured":"Wang, Y., Khan, S., Gonzalez-Garcia, A., Weijer, Jvd., & Khan, F. S. (2020). Semi-supervised learning for few-shot image-to-image translation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 4453\u20134462.","DOI":"10.1109\/CVPR42600.2020.00451"},{"issue":"4","key":"1866_CR88","first-page":"600","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). Image quality assessment: From error visibility to structural similarity. TIP, 13(4), 600\u2013612.","journal-title":"TIP"},{"key":"1866_CR89","doi-asserted-by":"crossref","unstructured":"Wang, Z., Lu, Y., Li, Q., Tao, X., Guo, Y., Gong, M., & Liu, T. (2022). Cris: Clip-driven referring image segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 11686\u201311695.","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"1866_CR90","doi-asserted-by":"crossref","unstructured":"Wei, T., Chen, D., Zhou, W., Liao, J., Tan, Z., Yuan, L., Zhang, W., & Yu, N. (2021). Hairclip: Design your hair by text and reference image. arXiv preprint arXiv:2112.05142.","DOI":"10.1109\/CVPR52688.2022.01754"},{"key":"1866_CR91","unstructured":"Wu, P. W., Lin, Y. J., Chang, C. H., Chang, E. Y., & Liao, S. W. (2019). Relgan: Multi-domain image-to-image translation via relative attributes. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 5914\u20135922."},{"key":"1866_CR92","doi-asserted-by":"crossref","unstructured":"Xia, Z., Pan, X., Song, S., Li, L. E., & Huang, G. (2022). Vision transformer with deformable attention. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp. 4794\u20134803.","DOI":"10.1109\/CVPR52688.2022.00475"},{"key":"1866_CR93","unstructured":"Xiao, T., Singh, M., Mintun, E., Darrell, T., Doll\u00e1r, P., & Girshick, R. (2021). Early convolutions help transformers see better. arXiv preprint arXiv:2106.14881."},{"key":"1866_CR94","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J. M., & Luo, P. (2021). Segformer: Simple and efficient design for semantic segmentation with transformers. arXiv preprint arXiv:2105.15203."},{"key":"1866_CR95","doi-asserted-by":"crossref","unstructured":"Yi, Z., Zhang, H., Tan, P., & Gong, M. (2017). Dualgan: Unsupervised dual learning for image-to-image translation. In ICCV, pp. 2849\u20132857.","DOI":"10.1109\/ICCV.2017.310"},{"key":"1866_CR96","unstructured":"Yu, X., Chen, Y., Liu, S., Li, T., & Li, G. (2019). Multi-mapping image-to-image translation via learning disentanglement. In Advances in neural information processing systems 32."},{"key":"1866_CR97","doi-asserted-by":"crossref","unstructured":"Zhang, D., Zhang, H., Tang, J., Wang, M., Hua, X., & Sun, Q. (2020a). Feature pyramid transformer. In ECCV, Springer, pp. 323\u2013339.","DOI":"10.1007\/978-3-030-58604-1_20"},{"key":"1866_CR98","doi-asserted-by":"crossref","unstructured":"Zhang, L., & Agrawala, M. (2023). Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"1866_CR99","doi-asserted-by":"crossref","unstructured":"Zhang, P., Zhang, B., Chen, D., Yuan, L., & Wen, F. (2020b). Cross-domain correspondence learning for exemplar-based image translation. In CVPR, pp. 5143\u20135153.","DOI":"10.1109\/CVPR42600.2020.00519"},{"key":"1866_CR100","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., & Efros, A. A. (2016). Colorful image colorization. In ECCV, Springer, pp. 649\u2013666.","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"1866_CR101","doi-asserted-by":"crossref","unstructured":"Zhang, R., Zhu, J. Y., Isola, P., Geng, X., Lin, A. S., Yu, T., & Efros, A. A. (2017). Real-time user-guided image colorization with learned deep priors. arXiv preprint arXiv:1705.02999.","DOI":"10.1145\/3072959.3073703"},{"key":"1866_CR102","unstructured":"Zhao, L., Zhang, Z., Chen, T., Metaxas, D. N., & Zhang, H. (2021). Improved transformer for high-resolution gans. arXiv preprint arXiv:2106.07631."},{"key":"1866_CR103","doi-asserted-by":"crossref","unstructured":"Zheng, C., Cham, T. J., & Cai, J. (2021a). The spatially-correlative loss for various image translation tasks. In CVPR, pp. 16407\u201316417.","DOI":"10.1109\/CVPR46437.2021.01614"},{"key":"1866_CR104","doi-asserted-by":"crossref","unstructured":"Zheng, S., Lu, J., Zhao, H., Zhu, X., Luo, Z., Wang, Y., Fu, Y., Feng, J., Xiang, T., Torr, P. H., et al. (2021b). Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In CVPR, pp. 6881\u20136890.","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"1866_CR105","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L. H., Zhou, L., Dai, X., Yuan, L., Li, Y., et al. (2022). Regionclip: Region-based language-image pretraining. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 16793\u201316803.","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"1866_CR106","doi-asserted-by":"crossref","unstructured":"Zhou, X., Zhang, B., Zhang, T., Zhang, P., Bao, J., Chen, D., Zhang, Z., & Wen, F. (2021). Cocosnet v2: Full-resolution correspondence learning for image translation. In CVPR, pp. 11465\u201311475.","DOI":"10.1109\/CVPR46437.2021.01130"},{"key":"1866_CR107","doi-asserted-by":"crossref","unstructured":"Zhu, J. Y., Park, T., Isola, P., & Efros, A. A. (2017a). Unpaired image-to-image translation using cycle-consistent adversarial networks. In ICCV, pp. 2223\u20132232.","DOI":"10.1109\/ICCV.2017.244"},{"key":"1866_CR108","unstructured":"Zhu, J. Y., Zhang, R., Pathak, D., Darrell, T., Efros, A. A., Wang, O., & Shechtman, E. (2017b). Toward multimodal image-to-image translation. In Advances in neural information processing systems, 30."},{"key":"1866_CR109","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., & Dai, J. (2020). Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159."}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-023-01866-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-023-01866-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-023-01866-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,26]],"date-time":"2024-03-26T11:13:19Z","timestamp":1711451599000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-023-01866-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,31]]},"references-count":109,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,4]]}},"alternative-id":["1866"],"URL":"https:\/\/doi.org\/10.1007\/s11263-023-01866-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10,31]]},"assertion":[{"value":"27 July 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 June 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 October 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}