{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T00:24:21Z","timestamp":1760142261610,"version":"build-2065373602"},"reference-count":84,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2025,6,27]],"date-time":"2025-06-27T00:00:00Z","timestamp":1750982400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,27]],"date-time":"2025-06-27T00:00:00Z","timestamp":1750982400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2025","62402252"],"award-info":[{"award-number":["U21B2025","62402252"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Pengcheng Laboratory Research Project","award":["PCL2023A08"],"award-info":[{"award-number":["PCL2023A08"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s11263-025-02499-z","type":"journal-article","created":{"date-parts":[[2025,6,27]],"date-time":"2025-06-27T00:21:29Z","timestamp":1750983689000},"page":"6813-6831","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Generic Scene Graph Generation Model with Hierarchical Prompt 
Learning"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-8151-4204","authenticated-orcid":false,"given":"Xuhan","family":"Zhu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5206-9515","authenticated-orcid":false,"given":"Yifei","family":"Xing","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1830-2595","authenticated-orcid":false,"given":"Ruiping","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6110-4036","authenticated-orcid":false,"given":"Yaowei","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8564-0346","authenticated-orcid":false,"given":"Xiangyuan","family":"Lan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,27]]},"reference":[{"key":"2499_CR1","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In: European conference on computer vision, Springer, 213\u2013229.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2499_CR2","doi-asserted-by":"crossref","unstructured":"Chen, S., Jin, Q., Wang, P., & Wu, Q. (2020). Say as you wish: Fine-grained control of image caption generation with abstract scene graphs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 9962\u20139971.","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"2499_CR3","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wu, J., Lei, Z., Zhang, Z., & Chen, C. (2023). Expanding scene graph boundaries: Fully open-vocabulary scene graph generation via visual-concept alignment and retention. arXiv preprint arXiv:2311.10988.","DOI":"10.1007\/978-3-031-72848-8_7"},{"key":"2499_CR4","doi-asserted-by":"crossref","unstructured":"Chiou, M. 
J., Ding, H., Yan, H., Wang, C., Zimmermann, R., & Feng, J. (2021). Recovering the unbiased scene graphs from the biased ones. In: Proceedings of the 29th ACM International Conference on Multimedia, 1581\u20131590.","DOI":"10.1145\/3474085.3475297"},{"issue":"9","key":"2499_CR5","doi-asserted-by":"publisher","first-page":"11169","DOI":"10.1109\/TPAMI.2023.3268066","volume":"45","author":"Y Cong","year":"2023","unstructured":"Cong, Y., Yang, M. Y., & Rosenhahn, B. (2023). Reltr: Relation transformer for scene graph generation. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(9), 11169\u201311183.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2499_CR6","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jia, M., Lin, T. Y., Song, Y., & Belongie, S. (2019). Class-balanced loss based on effective number of samples. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 9268\u20139277.","DOI":"10.1109\/CVPR.2019.00949"},{"key":"2499_CR7","unstructured":"Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"2499_CR8","doi-asserted-by":"crossref","unstructured":"Dong, X., Gan, T., Song, X., Wu, J., Cheng, Y., & Nie, L. (2022). Stacked hybrid-attention and group collaborative learning for unbiased scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 19427\u201319436.","DOI":"10.1109\/CVPR52688.2022.01882"},{"key":"2499_CR9","doi-asserted-by":"crossref","unstructured":"Du, Y., Wei, F., Zhang, Z., Shi, M., Gao, Y., & Li, G. (2022). Learning to prompt for open-vocabulary object detection with vision-language model. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 14084\u201314093.","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"2499_CR10","doi-asserted-by":"crossref","unstructured":"Gu, J., Joty, S., Cai, J., Zhao, H., Yang, X., & Wang, G. (2019). Unpaired image captioning via scene graph alignments. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 10323\u201310332.","DOI":"10.1109\/ICCV.2019.01042"},{"key":"2499_CR11","unstructured":"Gu, J., Han, Z., Chen, S., et al. (2023). A systematic survey of prompt engineering on vision-language foundation models. arXiv preprint arXiv:2307.12980."},{"key":"2499_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2499_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision, 2961\u20132969.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2499_CR14","doi-asserted-by":"crossref","unstructured":"He, T., Gao, L., Song, J., & Li, Y. F. (2022). Towards open-vocabulary scene graph generation with prompt-based finetuning. In: European Conference on Computer Vision, Springer, 56\u201373.","DOI":"10.1007\/978-3-031-19815-1_4"},{"key":"2499_CR15","unstructured":"Hildebrandt, M., Li, H., Koner, R., Tresp, V., & G\u00fcnnemann, S. (2020). Scene graph reasoning for visual question answering. arXiv preprint arXiv:2007.01072."},{"key":"2499_CR16","doi-asserted-by":"crossref","unstructured":"Hu, J., Huang, L., Ren, T., Zhang, S., Ji, R., & Cao, L. (2023). You only segment once: Towards real-time panoptic segmentation. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 17819\u201317829.","DOI":"10.1109\/CVPR52729.2023.01709"},{"key":"2499_CR17","doi-asserted-by":"crossref","unstructured":"Im, J., Nam, J., Park, N., Lee, H., & Park, S. (2024). Egtr: Extracting graph from transformer for scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 24229\u201324238.","DOI":"10.1109\/CVPR52733.2024.02287"},{"key":"2499_CR18","doi-asserted-by":"crossref","unstructured":"Jeon, J., Kim, K., Yoon, K., & Park, C. (2025). Semantic diversity-aware prototype-based learning for unbiased scene graph generation. In: European Conference on Computer Vision, Springer, 379\u2013395.","DOI":"10.1007\/978-3-031-73113-6_22"},{"key":"2499_CR19","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y. T., Parekh, Z., Pham, H., Le, Q. V., Sung, Y., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML."},{"key":"2499_CR20","doi-asserted-by":"crossref","unstructured":"Jin, W., Cheng, Y., Shen, Y., Chen, W., & Ren, X. (2021a). A good prompt is worth millions of parameters? low-resource prompt-based learning for vision-language models. arXiv preprint arXiv:2110.08484.","DOI":"10.18653\/v1\/2022.acl-long.197"},{"key":"2499_CR21","unstructured":"Jin, Y., Chen, Y., Wang, L., Wang, J., Yu, P., Liu, Z., & Hwang, J. N. (2021b). Is object detection necessary for human-object interaction recognition? arXiv preprint arXiv:2107.13083."},{"key":"2499_CR22","unstructured":"Jocher, G., Chaurasia, A., & Qiu, J. (2023). Ultralytics YOLO. https:\/\/github.com\/ultralytics\/ultralytics."},{"key":"2499_CR23","doi-asserted-by":"crossref","unstructured":"Johnson, J., Krishna, R., Stark, M., Li, L. J., Shamma, D., Bernstein, M., & Fei-Fei, L. (2015). Image retrieval using scene graphs. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, 3668\u20133678.","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"2499_CR24","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., Doll\u00e1r, P. (2019). Panoptic feature pyramid networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 6399\u20136408.","DOI":"10.1109\/CVPR.2019.00656"},{"key":"2499_CR25","doi-asserted-by":"crossref","unstructured":"Knyazev, B., de Vries, H., Cangea, C., Taylor, G. W., Courville, A., & Belilovsky, E. (2020). Graph density-aware losses for novel compositions in scene graph generation. In: BMVC.","DOI":"10.5244\/C.34.99"},{"issue":"1","key":"2499_CR26","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L. J., Shamma, D. A., et al. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision, 123(1), 32\u201373.","journal-title":"International journal of computer vision"},{"issue":"7","key":"2499_CR27","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., Rom, H., Alldrin, N., Uijlings, J., Krasin, I., Pont-Tuset, J., Kamali, S., Popov, S., Malloci, M., Kolesnikov, A., Duerig, T., & Ferrari, V. (2020). The open images dataset v4: Unified image classification, object detection, and visual relationship detection at scale. International Journal of Computer Vision(IJCV), 128(7), 1956\u20131981.","journal-title":"International Journal of Computer Vision(IJCV)"},{"key":"2499_CR28","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Xu, H., Liu, S., Zhang, L., Ni, L. M., Shum, H. Y. (2023a). 
Mask dino: Towards a unified transformer-based framework for object detection and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 3041\u20133050.","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"2499_CR29","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, L., Huang, Y., Zhang, Z., Zhang, S., & Xiao, J. (2022a). The devil is in the labels: Noisy label correction for robust scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 18869\u201318878.","DOI":"10.1109\/CVPR52688.2022.01830"},{"key":"2499_CR30","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, G., Xiao, J., Yang, Y., Wang, C., & Chen, L. (2023b). Compositional feature augmentation for unbiased scene graph generation. arXiv preprint arXiv:2308.06712.","DOI":"10.1109\/ICCV51070.2023.01982"},{"key":"2499_CR31","first-page":"50105","volume":"36","author":"L Li","year":"2023","unstructured":"Li, L., Xiao, J., Chen, G., Shao, J., Zhuang, Y., & Chen, L. (2023). Zero-shot visual relation detection via composite visual cues from large language models. Advances in Neural Information Processing Systems, 36, 50105\u201350116.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2499_CR32","doi-asserted-by":"crossref","unstructured":"Li, R., Zhang, S., Wan, B., & He, X. (2021). Bipartite graph network with adaptive message passing for unbiased scene graph generation. In: CVPR.","DOI":"10.1109\/CVPR46437.2021.01096"},{"key":"2499_CR33","doi-asserted-by":"crossref","unstructured":"Li, R., Zhang, S., & He, X. (2022b). Sgtr: End-to-end scene graph generation with transformer. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 19486\u201319496.","DOI":"10.1109\/CVPR52688.2022.01888"},{"key":"2499_CR34","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Goyal, P., Girshick, R., He, K., & Dollar, P. (2017). 
Focal loss for dense object detection.","DOI":"10.1109\/ICCV.2017.324"},{"key":"2499_CR35","doi-asserted-by":"crossref","unstructured":"Lin, X., Ding, C., Zeng, J., & Tao, D. (2020). Gps-net: Graph property sensing network for scene graph generation. In: CVPR.","DOI":"10.1109\/CVPR42600.2020.00380"},{"key":"2499_CR36","doi-asserted-by":"crossref","unstructured":"Lin, X., Ding, C., Zhan, Y., Li, Z., & Tao, D. (2022). Hl-net: Heterophily learning network for scene graph generation. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 19476\u201319485.","DOI":"10.1109\/CVPR52688.2022.01887"},{"key":"2499_CR37","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., & Neubig, G. (2021a). Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. arXiv preprint arXiv:2107.13586."},{"key":"2499_CR38","doi-asserted-by":"crossref","unstructured":"Liu, S., Zeng, Z., Ren, T., Li, F., Zhang, H., Yang, J., Jiang, Q., Li, C., Yang, J., Su, H., et al. (2024). Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In: European Conference on Computer Vision, Springer, 38\u201355.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"2499_CR39","unstructured":"Liu, X., Zheng, Y., Du, Z., Ding, M., Qian, Y., Yang, Z., & Tang, J. (2021b). Gpt understands, too. arXiv preprint arXiv:2103.10385."},{"key":"2499_CR40","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021c). Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision, 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2499_CR41","doi-asserted-by":"crossref","unstructured":"Lorenz, J., Pest, A., Kienzle, D., Ludwig, K., & Lienhart, R. (2024). A fair ranking and new model for panoptic scene graph generation. 
arXiv preprint arXiv:2407.09216.","DOI":"10.1007\/978-3-031-73030-6_9"},{"key":"2499_CR42","doi-asserted-by":"crossref","unstructured":"Lyu, X., Gao, L., Guo, Y., Zhao, Z., Huang, H., Shen, H. T., & Song, J. (2022). Fine-grained predicates learning for scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 19467\u201319475.","DOI":"10.1109\/CVPR52688.2022.01886"},{"key":"2499_CR43","doi-asserted-by":"crossref","unstructured":"Ma, N., Zhang, X., Zheng, H. T., & Sun, J. (2018). Shufflenet v2: Practical guidelines for efficient cnn architecture design. In: Proceedings of the European conference on computer vision (ECCV), 116\u2013131.","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"2499_CR44","unstructured":"Neau, M., Santos, P. E., Bosser, A. G., & Buche, C. (2024). React: Real-time efficiency and accuracy compromise for tradeoffs in scene graph generation. arxiv:2405.16116."},{"key":"2499_CR45","unstructured":"Peng, Y., Li, H., Wu, P., Zhang, Y., Sun, X., & Wu, F. (2025). D-FINE: Redefine regression task of DETRs as fine-grained distribution refinement. In: The Thirteenth International Conference on Learning Representations, https:\/\/openreview.net\/forum?id=MFZjrTFE7h."},{"key":"2499_CR46","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et al. (2018). Improving language understanding by generative pre-training. Technical report, OpenAI."},{"issue":"8","key":"2499_CR47","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al. (2019). Language models are unsupervised multitask learners. OpenAI blog, 1(8), 9.","journal-title":"OpenAI blog"},{"key":"2499_CR48","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. (2021). Learning transferable visual models from natural language supervision. 
In: ICML, PMLR, 8748\u20138763."},{"key":"2499_CR49","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., & Farhadi, A. (2016). You only look once: Unified, real-time object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 779\u2013788.","DOI":"10.1109\/CVPR.2016.91"},{"key":"2499_CR50","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, 28."},{"key":"2499_CR51","unstructured":"SegmentsAI (2023) Panoptic segment anything. https:\/\/github.com\/segments-ai\/panoptic-segment-anything."},{"key":"2499_CR52","doi-asserted-by":"crossref","unstructured":"Shi, H., Li, L., Xiao, J., Zhuang, Y., & Chen, L. (2024). From easy to hard: Learning curricular shape-aware features for robust panoptic scene graph generation. International Journal of Computer Vision, 1\u201320.","DOI":"10.1007\/s11263-024-02190-9"},{"key":"2499_CR53","doi-asserted-by":"crossref","unstructured":"Tan, J., Wang, C., Li, B., Li, Q., Ouyang, W., Yin, C., & Yan, J. (2020). Equalization loss for long-tailed object recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 11662\u201311671.","DOI":"10.1109\/CVPR42600.2020.01168"},{"key":"2499_CR54","doi-asserted-by":"crossref","unstructured":"Tan, J., Lu, X., Zhang, G., Yin, C., & Li, Q. (2021). Equalization loss v2: A new gradient balance approach for long-tailed object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 1685\u20131694.","DOI":"10.1109\/CVPR46437.2021.00173"},{"key":"2499_CR55","doi-asserted-by":"crossref","unstructured":"Tang, K., Zhang, H., Wu, B., Luo, W., & Liu, W. (2019). Learning to compose dynamic tree structures for visual contexts. 
In: CVPR.","DOI":"10.1109\/CVPR.2019.00678"},{"key":"2499_CR56","doi-asserted-by":"crossref","unstructured":"Tang, K., Niu, Y., Huang, J., Shi, J., & Zhang, H. (2020). Unbiased scene graph generation from biased training. In: CVPR.","DOI":"10.1109\/CVPR42600.2020.00377"},{"key":"2499_CR57","doi-asserted-by":"crossref","unstructured":"Teney, D., Liu, L., & van Den Hengel, A. (2017). Graph-structured representations for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 1\u20139.","DOI":"10.1109\/CVPR.2017.344"},{"key":"2499_CR58","doi-asserted-by":"crossref","unstructured":"Wang, H., Li, Y., Yao, H., & Li, X. (2023a). Clipn for zero-shot ood detection: Teaching clip to say no. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 1802\u20131812.","DOI":"10.1109\/ICCV51070.2023.00173"},{"key":"2499_CR59","doi-asserted-by":"publisher","unstructured":"Wang, J., Wen, Z., Li, X., Guo, Z., Yang, J., & Liu, Z. (2024). Pair then relation: Pair-Net for panoptic scene graph generation. https:\/\/doi.org\/10.48550\/arXiv.2307.08699, arxiv:2307.08699 [cs].","DOI":"10.48550\/arXiv.2307.08699"},{"issue":"10","key":"2499_CR60","doi-asserted-by":"publisher","first-page":"2489","DOI":"10.1007\/s11263-023-01817-7","volume":"131","author":"W Wang","year":"2023","unstructured":"Wang, W., Wang, R., Shan, S., & Chen, X. (2023). Importance first: Generating scene graph of human interest. International Journal of Computer Vision, 131(10), 2489\u20132515.","journal-title":"International Journal of Computer Vision"},{"key":"2499_CR61","doi-asserted-by":"crossref","unstructured":"Xian, Y., Schiele, B., & Akata, Z. (2017). Zero-shot learning-the good, the bad and the ugly. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 4582\u20134591.","DOI":"10.1109\/CVPR.2017.328"},{"key":"2499_CR62","doi-asserted-by":"crossref","unstructured":"Xu, D., Zhu, Y., Choy, C., Fei-Fei, L. 
(2017). Scene graph generation by iterative message passing. In: CVPR.","DOI":"10.1109\/CVPR.2017.330"},{"key":"2499_CR63","doi-asserted-by":"crossref","unstructured":"Xu, G., Chai, J., & Kordjamshidi, P. (2024). Gipcol: Graph-injected soft prompting for compositional zero-shot learning. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 5774\u20135783.","DOI":"10.1109\/WACV57701.2024.00567"},{"key":"2499_CR64","doi-asserted-by":"crossref","unstructured":"Yan, S., Shen, C., Jin, Z., Huang, J., Jiang, R., Chen, Y., & Hua, X. (2020). PCPL: predicate-correlation perception learning for unbiased scene graph generation. In: ACM MM.","DOI":"10.1145\/3394171.3413722"},{"key":"2499_CR65","doi-asserted-by":"crossref","unstructured":"Yang, J., Lu, J., Lee, S., Batra, D., & Parikh, D. (2018). Graph r-cnn for scene graph generation. In: ECCV, 670\u2013685.","DOI":"10.1007\/978-3-030-01246-5_41"},{"key":"2499_CR66","doi-asserted-by":"crossref","unstructured":"Yang, J., Ang, Y. Z., Guo, Z., Zhou, K., Zhang, W., & Liu, Z. (2022). Panoptic scene graph generation. In: European Conference on Computer Vision, Springer, 178\u2013196.","DOI":"10.1007\/978-3-031-19812-0_11"},{"key":"2499_CR67","unstructured":"Yao, Y., Zhang, A., Zhang, Z., Liu, Z., Chua, T. S., & Sun, M. (2021). Cpt: Colorful prompt tuning for pre-trained vision-language models. arXiv preprint arXiv:2109.11797."},{"key":"2499_CR68","doi-asserted-by":"crossref","unstructured":"Yu, Q., Li, J., Wu, Y., Tang, S., Ji, W., & Zhuang, Y. (2023). Visually-prompted language model for fine-grained scene graph generation in an open world. arXiv preprint arXiv:2303.13233.","DOI":"10.1109\/ICCV51070.2023.01971"},{"key":"2499_CR69","doi-asserted-by":"crossref","unstructured":"Zellers, R., Yatskar, M., Thomson, S., & Choi, Y. (2018). Neural motifs: Scene graph parsing with global context. 
In: CVPR.","DOI":"10.1109\/CVPR.2018.00611"},{"key":"2499_CR70","doi-asserted-by":"crossref","unstructured":"Zhang, A., Yao, Y., Chen, Q., Ji, W., Liu, Z., Sun, M., & Chua, T. S. (2022). Fine-grained scene graph generation with data transfer. In: ECCV.","DOI":"10.1007\/978-3-031-19812-0_24"},{"key":"2499_CR71","unstructured":"Zhang, C., Chao, W. L., & Xuan, D. (2019a). An empirical study on leveraging scene graphs for visual question answering. arXiv preprint arXiv:1907.12133."},{"key":"2499_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, J., Shih, K., Elgammal, A., Tao, A., & Catanzaro, B. (2019b). Graphical contrastive losses for scene graph parsing. In: CVPR.","DOI":"10.1109\/CVPR.2019.01180"},{"key":"2499_CR73","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Huang, X., Ma, J., Li, Z., Luo, Z., Xie, Y., Qin, Y., Luo, T., Li, Y., Liu, S., et al. (2023a). Recognize anything: A strong image tagging model. arXiv preprint arXiv:2306.03514.","DOI":"10.1109\/CVPRW63382.2024.00179"},{"key":"2499_CR74","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Pan, Y., Yao, T., Huang, R., Mei, T., & Chen, C. W. (2023b). Learning to generate language-supervised and open-vocabulary scene graph using pre-trained visual-semantic space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2915\u20132924.","DOI":"10.1109\/CVPR52729.2023.00285"},{"issue":"3","key":"2499_CR75","doi-asserted-by":"publisher","first-page":"1743","DOI":"10.1109\/TCSVT.2023.3297842","volume":"34","author":"C Zheng","year":"2023","unstructured":"Zheng, C., Gao, L., Lyu, X., Zeng, P., El Saddik, A., & Shen, H. T. (2023). Dual-branch hybrid learning network for unbiased scene graph generation. 
IEEE Transactions on Circuits and Systems for Video Technology, 34(3), 1743\u20131756.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"2499_CR76","doi-asserted-by":"crossref","unstructured":"Zheng, C., Lyu, X., Gao, L., Dai, B., & Song, J. (2023b). Prototype-based embedding network for scene graph generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 22783\u201322792.","DOI":"10.1109\/CVPR52729.2023.02182"},{"key":"2499_CR77","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Wang, L., Chen, J., Yu, D., & Li, Y. (2020). Comprehensive image captioning via scene graph decomposition. In: Proceedings of the European Conference on Computer Vision, 211\u2013229.","DOI":"10.1007\/978-3-030-58568-6_13"},{"key":"2499_CR78","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022a). Conditional prompt learning for vision-language models. In: CVPR, 16816\u201316825.","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"2499_CR79","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022). Learning to prompt for vision-language models. International Journal of Computer Vision, 130(9), 2337\u20132348.","journal-title":"International Journal of Computer Vision"},{"key":"2499_CR80","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Lei, Y., Zhang, B., Liu, L., & Liu, Y. (2023a). Zegclip: Towards adapting clip for zero-shot semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 11175\u201311185.","DOI":"10.1109\/CVPR52729.2023.01075"},{"key":"2499_CR81","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Shi, M., & Caesar, H. (2023b). HiLo: Exploiting high low frequency relations for unbiased panoptic scene graph generation. 
In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), 21637\u201321648.","DOI":"10.1109\/ICCV51070.2023.01978"},{"key":"2499_CR82","unstructured":"Zhu, P., Wang, X., Zhu, L., Sun, Z., Zheng, W., Wang, Y., & Chen, C. (2022). Prompt-based learning for unpaired image captioning. arXiv preprint arXiv:2205.13125."},{"key":"2499_CR83","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., & Dai, J. (2020). Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159."},{"key":"2499_CR84","unstructured":"Zhu, X., Xing, Y., Wang, R., Wang, Y., & Lan, X. (2024). Hierarchical prompt learning for scene graph generation. In: 35th British Machine Vision Conference 2024, BMVC 2024, Glasgow, UK, November 25-28, 2024, BMVA, https:\/\/papers.bmvc2024.org\/0183.pdf."}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02499-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02499-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02499-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T08:49:18Z","timestamp":1760086158000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02499-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,27]]},"references-count":84,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["2499"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02499-z","relation":{},"ISSN":["0920-5691","1573-1405"],"iss
n-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,6,27]]},"assertion":[{"value":"28 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 June 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}