{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T15:51:39Z","timestamp":1778860299749,"version":"3.51.4"},"publisher-location":"Cham","reference-count":49,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726835","type":"print"},{"value":"9783031726842","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72684-2_12","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:02:45Z","timestamp":1730574165000},"page":"199-215","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["OpenPSG: Open-Set Panoptic Scene Graph Generation via\u00a0Large Multimodal Models"],"prefix":"10.1007","author":[{"given":"Zijian","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Zheng","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Holger","family":"Caesar","sequence":"additional","affiliation":[]},{"given":"Miaojing","family":"Shi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"12_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: COCO-stuff: thing and stuff classes in context. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"12_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"12_CR4","doi-asserted-by":"crossref","unstructured":"Chen, S., Jin, Q., Wang, P., Wu, Q.: Say as you wish: fine-grained control of image caption generation with abstract scene graphs. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wu, J., Lei, Z., Zhang, Z., Chen, C.: Expanding scene graph boundaries: fully open-vocabulary scene graph generation via visual-concept alignment and retention. arXiv preprint arXiv:2311.10988 (2023)","DOI":"10.1007\/978-3-031-72848-8_7"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: CVPR, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"12_CR7","doi-asserted-by":"crossref","unstructured":"Dai, J., He, K., Sun, J.: Convolutional feature masking for joint object and stuff segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3992\u20134000 (2015)","DOI":"10.1109\/CVPR.2015.7299025"},{"key":"12_CR8","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Du, Y., Wei, F., Zhang, Z., Shi, M., Gao, Y., Li, G.: Learning to prompt for open-vocabulary object detection with vision-language model. In: CVPR, pp. 14084\u201314093 (2022)","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Gao, L., Wang, B., Wang, W.: Image captioning with scene-graph based semantic concepts. In: ICMLC (2018)","DOI":"10.1145\/3195106.3195114"},{"key":"12_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"540","DOI":"10.1007\/978-3-031-20059-5_31","volume-title":"Computer Vision \u2013 ECCV 2022","author":"G Ghiasi","year":"2022","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Scaling open-vocabulary image segmentation with image-level labels. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13696, pp. 540\u2013557. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_31"},{"key":"12_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"56","DOI":"10.1007\/978-3-031-19815-1_4","volume-title":"Computer Vision \u2013 ECCV 2022","author":"T He","year":"2022","unstructured":"He, T., Gao, L., Song, J., Li, Y.F.: Towards open-vocabulary scene graph generation with prompt-based finetuning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13688, pp. 56\u201373. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_4"},{"key":"12_CR13","unstructured":"Hildebrandt, M., Li, H., Koner, R., Tresp, V., G\u00fcnnemann, S.: Scene graph reasoning for visual question answering. arXiv preprint arXiv:2007.01072 (2020)"},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Joshi, M., Levy, O., Weld, D.S., Zettlemoyer, L.: BERT for coreference resolution: baselines and analysis. arXiv preprint arXiv:1908.09091 (2019)","DOI":"10.18653\/v1\/D19-1588"},{"key":"12_CR15","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"466","DOI":"10.1007\/978-3-030-86520-7_29","volume-title":"Machine Learning and Knowledge Discovery in Databases. Research Track","author":"X Kan","year":"2021","unstructured":"Kan, X., Cui, H., Yang, C.: Zero-shot scene graph relation prediction through commonsense knowledge integration. In: Oliver, N., P\u00e9rez-Cruz, F., Kramer, S., Read, J., Lozano, J.A. (eds.) ECML PKDD 2021, Part II. LNCS (LNAI), vol. 12976, pp. 466\u2013482. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-86520-7_29"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Kirillov, A., He, K., Girshick, R., Rother, C., Doll\u00e1r, P.: Panoptic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9404\u20139413 (2019)","DOI":"10.1109\/CVPR.2019.00963"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Krishna, R., et\u00a0al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 32\u201373 (2017)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"12_CR18","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"12_CR19","unstructured":"Li, L., et al.: Panoptic scene graph generation with semantics-prototype learning. arXiv preprint arXiv:2307.15567 (2023)"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted clip. In: CVPR, pp. 7061\u20137070 (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"12_CR21","unstructured":"Lin, C., et al.: Learning object-language alignments for open-vocabulary object detection. arXiv preprint arXiv:2211.14843 (2022)"},{"key":"12_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Lin, X., Ding, C., Zeng, J., Tao, D.: GPS-net: graph property sensing network for scene graph generation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00380"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"12_CR25","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS, vol. 36 (2024)"},{"key":"12_CR26","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"12_CR27","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"issue":"1","key":"12_CR28","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"12_CR29","doi-asserted-by":"crossref","unstructured":"Singh, K.P., Salvador, J., Weihs, L., Kembhavi, A.: Scene graph contrastive learning for embodied navigation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00999"},{"key":"12_CR30","doi-asserted-by":"crossref","unstructured":"Tang, K., Zhang, H., Wu, B., Luo, W., Liu, W.: Learning to compose dynamic tree structures for visual contexts. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00678"},{"key":"12_CR31","unstructured":"Touvron, H., et\u00a0al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"12_CR32","unstructured":"Wang, C., Liu, X., Song, D.: Language models are open knowledge graphs. arXiv preprint arXiv:2010.11967 (2020)"},{"key":"12_CR33","doi-asserted-by":"crossref","unstructured":"Wang, J., Wen, Z., Li, X., Guo, Z., Yang, J., Liu, Z.: Pair then relation: pair-net for panoptic scene graph generation. arXiv preprint arXiv:2307.08699 (2023)","DOI":"10.1109\/TPAMI.2024.3442301"},{"key":"12_CR34","doi-asserted-by":"crossref","unstructured":"Xu, D., Zhu, Y., Choy, C.B., Fei-Fei, L.: Scene graph generation by iterative message passing. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.330"},{"key":"12_CR35","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Focusing on flexible masks: a novel framework for panoptic scene graph generation with relation constraints. In: ACM MM, pp. 4209\u20134218 (2023)","DOI":"10.1145\/3581783.3612544"},{"key":"12_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"178","DOI":"10.1007\/978-3-031-19812-0_11","volume-title":"Computer Vision \u2013 ECCV 2022","author":"J Yang","year":"2022","unstructured":"Yang, J., Ang, Y.Z., Guo, Z., Zhou, K., Zhang, W., Liu, Z.: Panoptic scene graph generation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13687, pp. 178\u2013196. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19812-0_11"},{"key":"12_CR37","doi-asserted-by":"crossref","unstructured":"Yao, L., et al.: DetCLIPv2: scalable open-vocabulary object detection pre-training via word-region alignment. In: CVPR, pp. 23497\u201323506 (2023)","DOI":"10.1109\/CVPR52729.2023.02250"},{"key":"12_CR38","doi-asserted-by":"crossref","unstructured":"Yu, Q., Li, J., Wu, Y., Tang, S., Ji, W., Zhuang, Y.: Visually-prompted language model for fine-grained scene graph generation in an open world. arXiv preprint arXiv:2303.13233 (2023)","DOI":"10.1109\/ICCV51070.2023.01971"},{"key":"12_CR39","unstructured":"Yu, Q., Shen, X., Chen, L.C.: Towards open-ended visual recognition with large language model. arXiv preprint arXiv:2311.08400 (2023)"},{"key":"12_CR40","doi-asserted-by":"crossref","unstructured":"Yu, X., et al.: Zero-shot scene graph generation with knowledge graph completion. In: 2022 IEEE International Conference on Multimedia and Expo (ICME), pp.\u00a01\u20136. IEEE (2022)","DOI":"10.1109\/ICME52920.2022.9859944"},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Yue, K., Chen, B.C., Geiping, J., Li, H., Goldstein, T., Lim, S.N.: Object recognition as next token prediction. arXiv preprint arXiv:2312.02142 (2023)","DOI":"10.1109\/CVPR52733.2024.01575"},{"key":"12_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/978-3-031-20077-9_7","volume-title":"Computer Vision \u2013 ECCV 2022","author":"Y Zang","year":"2022","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Open-vocabulary DETR with conditional matching. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 106\u2013122. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_7"},{"key":"12_CR43","doi-asserted-by":"crossref","unstructured":"Zellers, R., Yatskar, M., Thomson, S., Choi, Y.: Neural motifs: scene graph parsing with global context. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00611"},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: A simple framework for open-vocabulary segmentation and detection. In: ICCV, pp. 1020\u20131031 (2023)","DOI":"10.1109\/ICCV51070.2023.00100"},{"key":"12_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Pan, Y., Yao, T., Huang, R., Mei, T., Chen, C.W.: Learning to generate language-supervised and open-vocabulary scene graph using pre-trained visual-semantic space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2915\u20132924 (2023)","DOI":"10.1109\/CVPR52729.2023.00285"},{"key":"12_CR46","doi-asserted-by":"crossref","unstructured":"Zhao, C., Shen, Y., Chen, Z., Ding, M., Gan, C.: TextPSG: panoptic scene graph generation from textual descriptions. In: ICCV, pp. 2839\u20132850 (2023)","DOI":"10.1109\/ICCV51070.2023.00266"},{"key":"12_CR47","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Shi, M., Caesar, H.: HiLo: exploiting high low frequency relations for unbiased panoptic scene graph generation. arXiv preprint arXiv:2303.15994 (2023)","DOI":"10.1109\/ICCV51070.2023.01978"},{"key":"12_CR48","unstructured":"Zhou, Z., Shi, M., Caesar, H.: VLPrompt: vision-language prompting for panoptic scene graph generation. arXiv preprint arXiv:2311.16492 (2023)"},{"key":"12_CR49","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72684-2_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T18:18:29Z","timestamp":1732990709000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72684-2_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726835","9783031726842"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72684-2_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}