{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T06:32:52Z","timestamp":1762929172617,"version":"3.45.0"},"reference-count":84,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2025,8,12]],"date-time":"2025-08-12T00:00:00Z","timestamp":1754956800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,12]],"date-time":"2025-08-12T00:00:00Z","timestamp":1754956800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100010418","name":"Institute for Information and Communications Technology Promotion","doi-asserted-by":"publisher","award":["RS-2019-II190079"],"award-info":[{"award-number":["RS-2019-II190079"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006465","name":"Korea Creative Content Agency","doi-asserted-by":"publisher","award":["RS-2024-00345025"],"award-info":[{"award-number":["RS-2024-00345025"]}],"id":[{"id":"10.13039\/501100006465","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"crossref","award":["RS-2024-00341514","RS-2022-00187238"],"award-info":[{"award-number":["RS-2024-00341514","RS-2022-00187238"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1007\/s11263-025-02554-9","type":"journal-article","created":{"date-parts":[[2025,8,12]],"date-time":"2025-08-12T15:20:09Z","timestamp":1755012009000},"page":"7873-7896","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning Compositionality from Multifaceted Synthetic Data for Language-based Object Detection"],"prefix":"10.1007","volume":"133","author":[{"given":"Kwanyong","family":"Park","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sojung","family":"An","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yong Jae","family":"Lee","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7132-4454","authenticated-orcid":false,"given":"Donghyun","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,8,12]]},"reference":[{"key":"2554_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.\u00a0L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774."},{"key":"2554_CR2","unstructured":"Azizi, S., Kornblith, S., Saharia, C., Norouzi, M., & Fleet, D.\u00a0J. (2023). Synthetic data from diffusion models improves imagenet classification. arXiv preprint arXiv:2304.08466."},{"key":"2554_CR3","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. (2020). Language models are few-shot learners. Advances in neural information processing systems, 33, 1877\u20131901.","journal-title":"Advances in neural information processing systems"},{"key":"2554_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. (2020). Language models are few-shot learners. Advances in neural information processing systems, 33, 1877\u20131901.","journal-title":"Advances in neural information processing systems"},{"key":"2554_CR5","unstructured":"Camburu, O.-M., Rockt\u00e4schel, T., Lukasiewicz, T., & Blunsom, P. (2018). e-snli: Natural language inference with natural language explanations. Advances in Neural Information Processing Systems, 31."},{"key":"2554_CR6","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In European conference on computer vision, pages 213\u2013229. Springer.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2554_CR7","unstructured":"Castells, T., Song, H.-K., Piao, T., Choi, S., Kim, B.-K., Yim, H., Lee, C., Kim, J.\u00a0G., & Kim, T.-H. (2024). Edgefusion: on-device text-to-image generation. arXiv preprint arXiv:2404.11925."},{"key":"2554_CR8","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., & Soricut, R. (2021). Conceptual 12m: Pushing web-scale image-text pre-training to recognize long-tail visual concepts. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 3558\u20133568.","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"2554_CR9","unstructured":"Chen, J., Yu, J., Ge, C., Yao, L., Xie, E., Wu, Y., Wang, Z., Kwok, J., Luo, P., Lu, H., et\u00a0al. (2023). Pixart-alpha: Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426."},{"key":"2554_CR10","unstructured":"Devlin, J., Chang, M.-W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"2554_CR11","first-page":"32942","volume":"35","author":"Z-Y Dou","year":"2022","unstructured":"Dou, Z.-Y., Kamath, A., Gan, Z., Zhang, P., Wang, J., Li, L., Liu, Z., Liu, C., LeCun, Y., Peng, N., et al. (2022). Coarse-to-fine vision-language pre-training with fusion in the backbone. Advances in neural information processing systems, 35, 32942\u201332956.","journal-title":"Advances in neural information processing systems"},{"key":"2554_CR12","first-page":"76137","volume":"36","author":"S Doveh","year":"2024","unstructured":"Doveh, S., Arbelle, A., Harary, S., Herzig, R., Kim, D., Cascante-Bonilla, P., Alfassy, A., Panda, R., Giryes, R., Feris, R., et al. (2024). Dense and aligned captions (dac) promote compositional reasoning in vl models. Advances in Neural Information Processing Systems, 36, 76137\u201376150.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2554_CR13","doi-asserted-by":"crossref","unstructured":"Doveh, S., Arbelle, A., Harary, S., Panda, R., Herzig, R., Schwartz, E., Kim, D., Giryes, R., Feris, R., Ullman, S., et\u00a0al. (2022). Teaching structured vision &language concepts to vision &language models. arXiv preprint arXiv:2211.11733.","DOI":"10.1109\/CVPR52729.2023.00261"},{"key":"2554_CR14","doi-asserted-by":"crossref","unstructured":"Fan, L., Chen, K., Krishnan, D., Katabi, D., Isola, P., & Tian, Y. (2024). Scaling laws of synthetic images for model training... for now. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 7382\u20137392.","DOI":"10.1109\/CVPR52733.2024.00705"},{"key":"2554_CR15","unstructured":"Gadre, S.\u00a0Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., Marten, R., Wortsman, M., Ghosh, D., Zhang, J., et\u00a0al. (2024). Datacomp: In search of the next generation of multimodal datasets. Advances in Neural Information Processing Systems, 36."},{"key":"2554_CR16","unstructured":"Gan, C., Schwartz, J., Alter, S., Mrowca, D., Schrimpf, M., Traer, J., De\u00a0Freitas, J., Kubilius, J., Bhandwaldar, A., Haber, N., et\u00a0al. (2020). Threedworld: A platform for interactive multi-modal physical simulation. arXiv preprint arXiv:2007.04954."},{"key":"2554_CR17","unstructured":"Gu, X., Lin, T.-Y., Kuo, W., & Cui, Y. (2021). Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921."},{"key":"2554_CR18","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., & Girshick, R. (2019a). Lvis: A dataset for large vocabulary instance segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 5356\u20135364.","DOI":"10.1109\/CVPR.2019.00550"},{"key":"2554_CR19","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., & Girshick, R. (2019b). Lvis: A dataset for large vocabulary instance segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 5356\u20135364.","DOI":"10.1109\/CVPR.2019.00550"},{"key":"2554_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In Proceedings of the IEEE international conference on computer vision, pages 2961\u20132969.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2554_CR21","unstructured":"He, R., Sun, S., Yu, X., Xue, C., Zhang, W., Torr, P., Bai, S., & Qi, X. (2022). Is synthetic data from generative models ready for image recognition? arXiv preprint arXiv:2210.07574."},{"key":"2554_CR22","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.\u00a0L., & Choi, Y. (2021). Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718.","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"2554_CR23","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in neural information processing systems, 33, 6840\u20136851.","journal-title":"Advances in neural information processing systems"},{"key":"2554_CR24","unstructured":"Honnibal, M. & Montani, I. (2017). spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing. To appear."},{"key":"2554_CR25","doi-asserted-by":"crossref","unstructured":"Hosseini, A., Reddy, S., Bahdanau, D., Hjelm, R.\u00a0D., Sordoni, A., & Courville, A. (2021). Understanding by understanding not: Modeling negation in language models. arXiv preprint arXiv:2105.03519.","DOI":"10.18653\/v1\/2021.naacl-main.102"},{"key":"2554_CR26","doi-asserted-by":"crossref","unstructured":"Hsu, C.-C., Tsai, Y.-H., Lin, Y.-Y., & Yang, M.-H. (2020). Every pixel matters: Center-aware feature alignment for domain adaptive object detector. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IX 16, pages 733\u2013748. Springer.","DOI":"10.1007\/978-3-030-58545-7_42"},{"key":"2554_CR27","unstructured":"Huang, K., Sun, K., Xie, E., Li, Z., & Liu, X. (2024). T2i-compbench: A comprehensive benchmark for open-world compositional text-to-image generation. Advances in Neural Information Processing Systems, 36."},{"key":"2554_CR28","doi-asserted-by":"crossref","unstructured":"Hur, S., Shin, I., Park, K., Woo, S., & Kweon, I.\u00a0S. (2023). Learning classifiers of prototypes and reciprocal points for universal domain adaptation. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pages 531\u2013540.","DOI":"10.1109\/WACV56688.2023.00060"},{"key":"2554_CR29","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q.\u00a0V., Sung, Y., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision."},{"key":"2554_CR30","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., & Carion, N. (2021). Mdetr-modulated detection for end-to-end multi-modal understanding. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pages 1780\u20131790.","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"2554_CR31","doi-asserted-by":"crossref","unstructured":"Kim, D., Angelova, A., & Kuo, W. (2023). Region-aware pretraining for open-vocabulary object detection with vision transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 11144\u201311154.","DOI":"10.1109\/CVPR52729.2023.01072"},{"key":"2554_CR32","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., & Levy, O. (2024). Pick-a-pic: An open dataset of user preferences for text-to-image generation. Advances in Neural Information Processing Systems, 36."},{"key":"2554_CR33","first-page":"51597","volume":"37","author":"Y Lee","year":"2024","unstructured":"Lee, Y., Park, K., Cho, Y., Lee, Y.-J., & Hwang, S. J. (2024). Koala: Empirical lessons toward memory-efficient and fast diffusion models for text-to-image synthesis. Advances in Neural Information Processing Systems, 37, 51597\u201351633.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2554_CR34","unstructured":"Li, B., Zhang, Y., Guo, D., Zhang, R., Li, F., Zhang, H., Zhang, K., Li, Y., Liu, Z., & Li, C. (2024a). Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326."},{"key":"2554_CR35","unstructured":"Li, L., Dou, Z.-Y., Peng, N., & Chang, K.-W. (2024b). Desco: Learning object recognition with rich language descriptions. Advances in Neural Information Processing Systems, 36."},{"key":"2554_CR36","doi-asserted-by":"crossref","unstructured":"Li, L.\u00a0H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., Wang, L., Yuan, L., Zhang, L., Hwang, J.-N., et\u00a0al. (2022). Grounded language-image pre-training. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 10965\u201310975.","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"2554_CR37","doi-asserted-by":"crossref","unstructured":"Lin, S., Wang, K., Zeng, X., & Zhao, R. (2023). Explore the power of synthetic data on few-shot object detection. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 638\u2013647.","DOI":"10.1109\/CVPRW59228.2023.00071"},{"key":"2554_CR38","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.\u00a0L. (2014a). Microsoft coco: Common objects in context. In Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pages 740\u2013755. Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2554_CR39","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.\u00a0L. (2014b). Microsoft coco: Common objects in context. In Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pages 740\u2013755. Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2554_CR40","doi-asserted-by":"crossref","unstructured":"Liu, S., Zeng, Z., Ren, T., Li, F., Zhang, H., Yang, J., Li, C., Yang, J., Su, H., Zhu, J., et\u00a0al. (2023). Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"2554_CR41","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., & Stoyanov, V. (2019). Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692."},{"key":"2554_CR42","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE\/CVF international conference on computer vision, pages 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2554_CR43","doi-asserted-by":"crossref","unstructured":"Loper, E. & Bird, S. (2002). Nltk: The natural language toolkit. arXiv preprint cs\/0205028.","DOI":"10.3115\/1118108.1118117"},{"key":"2554_CR44","unstructured":"Mikami, H., Fukumizu, K., Murai, S., Suzuki, S., Kikuchi, Y., Suzuki, T., Maeda, S.-i., & Hayashi, K. (2021). A scaling law for synthetic-to-real transfer: How much is your pre-training effective? arXiv preprint arXiv:2108.11018."},{"key":"2554_CR45","doi-asserted-by":"crossref","unstructured":"Minderer, M., Gritsenko, A., Stone, A., Neumann, M., Weissenborn, D., Dosovitskiy, A., Mahendran, A., Arnab, A., Dehghani, M., Shen, Z., et\u00a0al. (2022). Simple open-vocabulary object detection. In European conference on computer vision, pages 728\u2013755. Springer.","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"2554_CR46","doi-asserted-by":"crossref","unstructured":"Mishra, S., Panda, R., Phoo, C.\u00a0P., Chen, C.-F.\u00a0R., Karlinsky, L., Saenko, K., Saligrama, V., & Feris, R.\u00a0S. (2022). Task2sim: Towards effective pre-training and transfer from synthetic data. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 9194\u20139204.","DOI":"10.1109\/CVPR52688.2022.00898"},{"key":"2554_CR47","unstructured":"Nguyen, Q., Vu, T., Tran, A., & Nguyen, K. (2024). Dataset diffusion: Diffusion-based synthetic data generation for pixel-level semantic segmentation. Advances in Neural Information Processing Systems, 36."},{"key":"2554_CR48","unstructured":"Ordonez, V., Kulkarni, G., & Berg, T. (2011). Im2text: Describing images using 1 million captioned photographs. Advances in neural information processing systems, 24."},{"key":"2554_CR49","doi-asserted-by":"crossref","unstructured":"Park, K., Saito, K., & Kim, D. (2024). Weak-to-strong compositional learning from generative models for language-based object detection. In Computer Vision\u2013ECCV 2024. Springer.","DOI":"10.1007\/978-3-031-73337-6_1"},{"key":"2554_CR50","doi-asserted-by":"crossref","unstructured":"Park, K., Woo, S., Oh, S.\u00a0W., Kweon, I.\u00a0S., & Lee, J.-Y. (2023). Mask-guided matting in the wild. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 1992\u20132001.","DOI":"10.1109\/CVPR52729.2023.00198"},{"key":"2554_CR51","first-page":"10869","volume":"33","author":"K Park","year":"2020","unstructured":"Park, K., Woo, S., Shin, I., & Kweon, I. S. (2020). Discover, hallucinate, and adapt: Open compound domain adaptation for semantic segmentation. Advances in Neural Information Processing Systems, 33, 10869\u201310880.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2554_CR52","doi-asserted-by":"crossref","unstructured":"Peng, X., Sun, B., Ali, K., & Saenko, K. (2015). Learning deep object detectors from 3d models. In Proceedings of the IEEE international conference on computer vision, pages 1278\u20131286.","DOI":"10.1109\/ICCV.2015.151"},{"key":"2554_CR53","doi-asserted-by":"crossref","unstructured":"Plummer, B.\u00a0A., Wang, L., Cervantes, C.\u00a0M., Caicedo, J.\u00a0C., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In Proceedings of the IEEE international conference on computer vision, pages 2641\u20132649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"2554_CR54","unstructured":"Podell, D., English, Z., Lacey, K., Blattmann, A., Dockhorn, T., M\u00fcller, J., Penna, J., & Rombach, R. (2023). Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952."},{"key":"2554_CR55","doi-asserted-by":"crossref","unstructured":"Prakash, A., Boochoon, S., Brophy, M., Acuna, D., Cameracci, E., State, G., Shapira, O., & Birchfield, S. (2019). Structured domain randomization: Bridging the reality gap by context-aware synthetic data. In 2019 International Conference on Robotics and Automation (ICRA), pages 7249\u20137255. IEEE.","DOI":"10.1109\/ICRA.2019.8794443"},{"key":"2554_CR56","unstructured":"Radford, A., Kim, J.\u00a0W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, pages 8748\u20138763. PMLR."},{"key":"2554_CR57","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, 28."},{"key":"2554_CR58","doi-asserted-by":"crossref","unstructured":"Richter, S.\u00a0R., Vineet, V., Roth, S., & Koltun, V. (2016). Playing for data: Ground truth from computer games. In Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part II 14, pages 102\u2013118. Springer.","DOI":"10.1007\/978-3-319-46475-6_7"},{"key":"2554_CR59","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2554_CR60","doi-asserted-by":"crossref","unstructured":"Sauer, A., Lorenz, D., Blattmann, A., & Rombach, R. (2023). Adversarial diffusion distillation. arXiv preprint arXiv:2311.17042.","DOI":"10.1007\/978-3-031-73016-0_6"},{"key":"2554_CR61","doi-asserted-by":"crossref","unstructured":"Schulter, S., Suh, Y., Dafnis, K.\u00a0M., Zhang, Z., Zhao, S., Metaxas, D., et\u00a0al. (2023). Omnilabel: A challenging benchmark for language-based object detection.","DOI":"10.1109\/ICCV51070.2023.01098"},{"key":"2554_CR62","doi-asserted-by":"crossref","unstructured":"Shao, S., Li, Z., Zhang, T., Peng, C., Yu, G., Zhang, X., Li, J., & Sun, J. (2019). Objects365: A large-scale, high-quality dataset for object detection. In Proceedings of the IEEE\/CVF international conference on computer vision, pages 8430\u20138439.","DOI":"10.1109\/ICCV.2019.00852"},{"key":"2554_CR63","unstructured":"Shin, I., Park, K., Woo, S., & Kweon, I.\u00a0S. (2021). Unsupervised domain adaptation for video semantic segmentation. arXiv preprint  arXiv:2107.11052."},{"key":"2554_CR64","doi-asserted-by":"crossref","unstructured":"Singh, K., Navaratnam, T., Holmer, J., Schaub-Meyer, S., & Roth, S. (2024). Is synthetic data all we need? benchmarking the robustness of models trained with synthetic images. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 2505\u20132515.","DOI":"10.1109\/CVPRW63382.2024.00257"},{"key":"2554_CR65","unstructured":"Song, J., Meng, C., & Ermon, S. (2020). Denoising diffusion implicit models. arXiv preprint  arXiv:2010.02502."},{"key":"2554_CR66","doi-asserted-by":"crossref","unstructured":"Thrush, T., Jiang, R., Bartolo, M., Singh, A., Williams, A., Kiela, D., & Ross, C. (2022). Winoground: Probing vision and language models for visio-linguistic compositionality. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 5238\u20135248.","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"2554_CR67","doi-asserted-by":"crossref","unstructured":"Tian, Y., Fan, L., Chen, K., Katabi, D., Krishnan, D., & Isola, P. (2024a). Learning vision from models rivals learning vision from data. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 15887\u201315898.","DOI":"10.1109\/CVPR52733.2024.01504"},{"key":"2554_CR68","unstructured":"Tian, Y., Fan, L., Isola, P., Chang, H., & Krishnan, D. (2024b). Stablerep: Synthetic images from text-to-image models make strong visual representation learners. Advances in Neural Information Processing Systems, 36."},{"key":"2554_CR69","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Chen, H., & He, T. (2019). Fcos: Fully convolutional one-stage object detection. In Proceedings of the IEEE\/CVF international conference on computer vision, pages 9627\u20139636.","DOI":"10.1109\/ICCV.2019.00972"},{"key":"2554_CR70","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et\u00a0al. (2023). Llama: Open and efficient foundation language models. arXiv preprint  arXiv:2302.13971."},{"key":"2554_CR71","doi-asserted-by":"crossref","unstructured":"Tsai, Y.-H., Hung, W.-C., Schulter, S., Sohn, K., Yang, M.-H., & Chandraker, M. (2018). Learning to adapt structured output space for semantic segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 7472\u20137481.","DOI":"10.1109\/CVPR.2018.00780"},{"key":"2554_CR72","doi-asserted-by":"crossref","unstructured":"Wang, Z., Yu, M., Wei, Y., Feris, R., Xiong, J., Hwu, W.-m., Huang, T.\u00a0S., & Shi, H. (2020). Differential treatment for stuff and things: A simple unsupervised domain adaptation method for semantic segmentation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 12635\u201312644.","DOI":"10.1109\/CVPR42600.2020.01265"},{"key":"2554_CR73","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, K., Oh, S.\u00a0W., Kweon, I.\u00a0S., & Lee, J.-Y. (2022a). Bridging images and videos: A simple learning framework for large vocabulary video object detection. In European Conference on Computer Vision, pages 238\u2013258. Springer.","DOI":"10.1007\/978-3-031-19806-9_14"},{"key":"2554_CR74","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, K., Oh, S.\u00a0W., Kweon, I.\u00a0S., & Lee, J.-Y. (2022b). Tracking by associating clips. In European Conference on Computer Vision, pages 129\u2013145. Springer.","DOI":"10.1007\/978-3-031-19806-9_8"},{"key":"2554_CR75","first-page":"54683","volume":"36","author":"W Wu","year":"2023","unstructured":"Wu, W., Zhao, Y., Chen, H., Gu, Y., Zhao, R., He, Y., Zhou, H., Shou, M. Z., & Shen, C. (2023). Datasetdm: Synthesizing data with perception annotations using diffusion models. Advances in Neural Information Processing Systems, 36, 54683\u201354695.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2554_CR76","unstructured":"Wu, X., Hao, Y., Sun, K., Chen, Y., Zhu, F., Zhao, R., & Li, H. (2023b). Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint  arXiv:2306.09341."},{"key":"2554_CR77","unstructured":"Xie, C., Zhang, Z., Wu, Y., Zhu, F., Zhao, R., & Liang, S. (2024). Described object detection: Liberating object detection with flexible expressions. Advances in Neural Information Processing Systems, 36."},{"key":"2554_CR78","doi-asserted-by":"crossref","unstructured":"Xie, J., Li, W., Li, X., Liu, Z., Ong, Y.\u00a0S., & Loy, C.\u00a0C. (2023). Mosaicfusion: Diffusion models as data augmenters for large vocabulary instance segmentation. arXiv preprint  arXiv:2309.13042.","DOI":"10.1007\/s11263-024-02223-3"},{"key":"2554_CR79","unstructured":"Xu, T., Chen, W., Wang, P., Wang, F., Li, H., & Jin, R. (2021). Cdtrans: Cross-domain transformer for unsupervised domain adaptation. arXiv preprint  arXiv:2109.06165."},{"key":"2554_CR80","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., & Zou, J. (2023). When and why vision-language models behave like bags-of-words, and what to do about it? In International Conference on Learning Representations."},{"key":"2554_CR81","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.\u00a0D., Hu, D.\u00a0H., & Chang, S.-F. (2021). Open-vocabulary object detection using captions. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 14393\u201314402.","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"2554_CR82","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., & Beyer, L. (2023). Sigmoid loss for language image pre-training. In Proceedings of the IEEE\/CVF international conference on computer vision, pages 11975\u201311986.","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"2554_CR83","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L.\u00a0H., Zhou, L., Dai, X., Yuan, L., Li, Y., et\u00a0al. (2022). Regionclip: Region-based language-image pretraining. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 16793\u201316803.","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"2554_CR84","doi-asserted-by":"crossref","unstructured":"Zhou, X., Girdhar, R., Joulin, A., Kr\u00e4henb\u00fchl, P., & Misra, I. (2022). Detecting twenty-thousand classes using image-level supervision. In European Conference on Computer Vision, pages 350\u2013368. Springer.","DOI":"10.1007\/978-3-031-20077-9_21"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02554-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02554-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02554-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T06:28:51Z","timestamp":1762928931000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02554-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,12]]},"references-count":84,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2025,11]]}},"alternative-id":["2554"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02554-9","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,8,12]]},"assertion":[{"value":"30 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}