{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T01:48:00Z","timestamp":1775785680714,"version":"3.50.1"},"reference-count":56,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,8,18]],"date-time":"2025-08-18T00:00:00Z","timestamp":1755475200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,8,18]],"date-time":"2025-08-18T00:00:00Z","timestamp":1755475200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100012165","name":"Key Technologies Research and Development Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012165","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J. King Saud Univ. Comput. Inf. Sci."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s44443-025-00203-2","type":"journal-article","created":{"date-parts":[[2025,8,18]],"date-time":"2025-08-18T15:07:08Z","timestamp":1755529628000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["SPIN-SGG: spatial integration for open-vocabulary scene graph generation"],"prefix":"10.1007","volume":"37","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5808-7843","authenticated-orcid":false,"given":"Nanhao","family":"Liang","sequence":"first","affiliation":[]},{"given":"Xiaoyuan","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Shengyi","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yingwei","family":"Xia","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,18]]},"reference":[{"key":"203_CR1","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1016\/j.cviu.2017.12.004","volume":"173","author":"S Aditya","year":"2018","unstructured":"Aditya S, Yang Y, Baral C et al (2018) Image understanding using vision and reasoning through scene description graph. Comput Vis Image Underst 173:33\u201345","journal-title":"Comput Vis Image Underst"},{"key":"203_CR2","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, et\u00a0al. (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR","DOI":"10.1109\/CVPR.2018.00636"},{"key":"203_CR3","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, et\u00a0al. (2015) Vqa: visual question answering. In: Proceedings of the IEEE international conference on computer vision. pp 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"key":"203_CR4","unstructured":"Bhat SF, Alhashim I, Wonka P (2021) Adabins: depth estimation using adaptive bins. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 4009\u20134018"},{"key":"203_CR5","first-page":"1403","volume":"34","author":"A Bozic","year":"2021","unstructured":"Bozic A, Palafox P, Thies J et al (2021) Transformerfusion: monocular rgb scene reconstruction using transformers. Adv Neural Inf Process Syst 34:1403\u20131414","journal-title":"Adv Neural Inf Process Syst"},{"key":"203_CR6","unstructured":"Brown TB (2020) Language models are few-shot learners. 
arXiv:2005.14165"},{"key":"203_CR7","unstructured":"Ce L (2008) Sift flow: dense correspondence across different scenes. ECCV 2008"},{"key":"203_CR8","unstructured":"Chen Y, Huang X, Qi Z, et\u00a0al. (2023) Depth anything: unifying monocular depth estimation and completion with vision transformers. arXiv:2311.16517"},{"key":"203_CR9","doi-asserted-by":"crossref","unstructured":"Chen S, Jin Q, Wang P, et\u00a0al. (2020) Say as you wish: fine-grained control of image caption generation with abstract scene graphs. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 9962\u20139971","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"203_CR10","doi-asserted-by":"crossref","unstructured":"Chen Z, Wang W, Tian H, et\u00a0al. (2024b) How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites. arXiv:2404.16821","DOI":"10.1007\/s11432-024-4231-5"},{"key":"203_CR11","doi-asserted-by":"crossref","unstructured":"Chen Z, Wu J, Wang W, et\u00a0al. (2024c) Internvl: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 24185\u201324198","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"203_CR12","doi-asserted-by":"crossref","unstructured":"Chen B, Xu Z, Kirmani S, et\u00a0al. (2024a) Spatialvlm: endowing vision-language models with spatial reasoning capabilities. arXiv:2401.12168","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"203_CR13","doi-asserted-by":"crossref","unstructured":"Chen T, Yu W, Chen R, et\u00a0al. (2019) Knowledge-embedded routing network for scene graph generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 6163\u20136171","DOI":"10.1109\/CVPR.2019.00632"},{"key":"203_CR14","first-page":"126725","volume":"37","author":"Y Du","year":"2024","unstructured":"Du Y, Sun W, Snoek C (2024) Ipo: interpretable prompt optimization for vision-language models. Adv Neural Inf Process Syst 37:126725\u2013126766","journal-title":"Adv Neural Inf Process Syst"},{"key":"203_CR15","unstructured":"Eigen D, Puhrsch C, Fergus R (2014) Depth map prediction from a single image using a multi-scale deep network. Adv Neural Inf Process Syst 27"},{"key":"203_CR16","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1007\/s11263-006-0031-y","volume":"75","author":"D Hoiem","year":"2007","unstructured":"Hoiem D, Efros AA, Hebert M (2007) Recovering surface layout from an image. Int J Comput Vision 75:151\u2013172","journal-title":"Int J Comput Vision"},{"key":"203_CR17","unstructured":"Huang S, Dong L, Wang W et al (2024) Language is not all you need: aligning perception with language models. Adv Neural Inf Process Syst 36"},{"key":"203_CR18","unstructured":"Jiang B, Zhuang Z, Taylor CJ (2023) Enhancing scene graph generation with hierarchical relationships and commonsense knowledge. arXiv:2311.12889"},{"key":"203_CR19","doi-asserted-by":"crossref","unstructured":"Jia M, Tang L, Chen BC, et\u00a0al. (2022) Visual prompt tuning. In: European conference on computer vision. Springer, pp 709\u2013727","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"203_CR20","unstructured":"Kazhdan M, Bolitho M, Hoppe H (2006) Poisson surface reconstruction. 
In: Proceedings of the fourth eurographics symposium on geometry processing"},{"issue":"3","key":"203_CR21","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1007\/BF00977785","volume":"9","author":"DT Lee","year":"1980","unstructured":"Lee DT, Schachter BJ (1980) Two algorithms for constructing a delaunay triangulation. Int J Comput Inf Sci 9(3):219\u2013242","journal-title":"Int J Comput Inf Sci"},{"key":"203_CR22","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li J, Selvaraju R, Gotmare A et al (2021) Align before fuse: vision and language representation learning with momentum distillation. Adv Neural Inf Process Syst 34:9694\u20139705","journal-title":"Adv Neural Inf Process Syst"},{"key":"203_CR23","doi-asserted-by":"crossref","unstructured":"Liang N, Liu Y, Sun W, et\u00a0al. (2024) Ckt-rcm: clip-based knowledge transfer and relational context mining for unbiased panoptic scene graph generation. In: ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, pp 3570\u20133574","DOI":"10.1109\/ICASSP48485.2024.10446810"},{"key":"203_CR24","unstructured":"Li J, Li D, Savarese S, et\u00a0al. (2023) Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International conference on machine learning. PMLR, pp 19730\u201319742"},{"key":"203_CR25","doi-asserted-by":"crossref","unstructured":"Li Y, Ouyang W, Zhou B, et\u00a0al. (2017) Scene graph generation from objects, phrases and region captions. In: Proceedings of the IEEE international conference on computer vision. pp 1261\u20131270","DOI":"10.1109\/ICCV.2017.142"},{"key":"203_CR26","doi-asserted-by":"crossref","unstructured":"Li Y, Ouyang W, Zhou B, et\u00a0al. (2018) Factorizable net: an efficient subgraph-based framework for scene graph generation. In: ECCV","DOI":"10.1007\/978-3-030-01246-5_21"},{"key":"203_CR27","doi-asserted-by":"crossref","unstructured":"Li B, Shen C, Dai Y, et\u00a0al. (2015) Depth and surface normal estimation from monocular images using regression on deep features and hierarchical crfs. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1119\u20131127","DOI":"10.1109\/CVPR.2015.7298715"},{"key":"203_CR28","unstructured":"Liu H, Li C, Wu Q et al (2024) Visual instruction tuning. Adv Neural Inf Process Syst 36"},{"key":"203_CR29","doi-asserted-by":"crossref","unstructured":"Liu S, Zeng Z, Ren T, et\u00a0al. (2023) Grounding dino: marrying dino with grounded pre-training for open-set object detection. arXiv:2303.05499","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"203_CR30","doi-asserted-by":"crossref","unstructured":"Li Z, Wang X, Liu X et al (2024) Binsformer: revisiting adaptive bins for monocular depth estimation. IEEE Trans Image Process","DOI":"10.1109\/TIP.2024.3416065"},{"key":"203_CR31","doi-asserted-by":"crossref","unstructured":"Lu C, Krishna R, Bernstein M, et\u00a0al. (2016) Visual relationship detection with language priors. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14. Springer, pp 852\u2013869","DOI":"10.1007\/978-3-319-46448-0_51"},{"key":"203_CR32","doi-asserted-by":"crossref","unstructured":"Murez Z, Van\u00a0As T, Bartolozzi J, et\u00a0al. (2020) Atlas: End-to-end 3d scene reconstruction from posed images. 
In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part VII 16. Springer, pp 414\u2013431","DOI":"10.1007\/978-3-030-58571-6_25"},{"key":"203_CR33","doi-asserted-by":"crossref","unstructured":"Pu T, Chen T, Wu H et al (2023) Spatial-temporal knowledge-embedded transformer for video scene graph generation. IEEE Trans Image Process","DOI":"10.1109\/TIP.2023.3345652"},{"key":"203_CR34","doi-asserted-by":"publisher","first-page":"3950","DOI":"10.1109\/TMM.2022.3169065","volume":"25","author":"T Qian","year":"2022","unstructured":"Qian T, Chen J, Chen S et al (2022) Scene graph refinement network for visual question answering. IEEE Trans Multimedia 25:3950\u20133961","journal-title":"IEEE Trans Multimedia"},{"key":"203_CR35","doi-asserted-by":"crossref","unstructured":"Qi M, Li W, Yang Z, et\u00a0al. (2019) Attentive relational networks for mapping images to scene graphs. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 3957\u20133966","DOI":"10.1109\/CVPR.2019.00408"},{"key":"203_CR36","unstructured":"Radford A, Kim JW, Hallacy C, et\u00a0al. (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning. PMLR, pp 8748\u20138763"},{"key":"203_CR37","doi-asserted-by":"crossref","unstructured":"Ranftl R, Bochkovskiy A, Koltun V (2021) Vision transformers for dense prediction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). pp 12179\u201312188","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"203_CR38","doi-asserted-by":"crossref","unstructured":"Sch\u00f6nberger JL, Zheng E, Frahm JM, et\u00a0al. (2016) Pixelwise view selection for unstructured multi-view stereo. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part III 14. Springer, pp 501\u2013518","DOI":"10.1007\/978-3-319-46487-9_31"},{"key":"203_CR39","doi-asserted-by":"crossref","unstructured":"Shao S, Pei Z, Chen W, et\u00a0al. (2023) Nddepth: normal-distance assisted monocular depth estimation. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp 7931\u20137940","DOI":"10.1109\/ICCV51070.2023.00729"},{"key":"203_CR40","unstructured":"Sun W, Du Y, Liu G, et\u00a0al. (2024) Training-free semantic segmentation via llm-supervision. arXiv preprint arXiv:2404.00701"},{"key":"203_CR41","unstructured":"Sun W, Song X, Li P, et\u00a0al. (2025) The curse of depth in large language models. arXiv:2502.05795"},{"key":"203_CR42","doi-asserted-by":"crossref","unstructured":"Sun J, Xie Y, Chen L, et\u00a0al. (2021) Neuralrecon: Real-time coherent 3d reconstruction from monocular video. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 15598\u201315607","DOI":"10.1109\/CVPR46437.2021.01534"},{"key":"203_CR43","doi-asserted-by":"crossref","unstructured":"Wang Wea (2024) The all-seeing project v2: towards general relation comprehension of the open world. arXiv:2402.19474","DOI":"10.1007\/978-3-031-73414-4_27"},{"key":"203_CR44","doi-asserted-by":"crossref","unstructured":"Wang W, Ren Y, Luo H, et\u00a0al. (2024) The all-seeing project v2: towards general relation comprehension of the open world. 
arXiv:2402.19474","DOI":"10.1007\/978-3-031-73414-4_27"},{"key":"203_CR45","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei J, Wang X, Schuurmans D et al (2022) Chain-of-thought prompting elicits reasoning in large language models. Adv Neural Inf Process Syst 35:24824\u201324837","journal-title":"Adv Neural Inf Process Syst"},{"key":"203_CR46","doi-asserted-by":"crossref","unstructured":"Xu M, Wu M, Zhao Y (2024) Llava-spacesgg: Visual instruct tuning for open-vocabulary scene graph generation with enhanced spatial relations. arXiv:2412.06322","DOI":"10.1109\/WACV61041.2025.00620"},{"key":"203_CR47","doi-asserted-by":"crossref","unstructured":"Xu D, Zhu Y, Choy CB, et\u00a0al. (2017) Scene graph generation by iterative message passing. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 5410\u20135419","DOI":"10.1109\/CVPR.2017.330"},{"key":"203_CR48","doi-asserted-by":"crossref","unstructured":"Yang J, Ang YZ, Guo Z, et\u00a0al. (2022a) Panoptic scene graph generation. In: ECCV","DOI":"10.1007\/978-3-031-19812-0_11"},{"key":"203_CR49","doi-asserted-by":"crossref","unstructured":"Yang J, Ang YZ, Guo Z, et\u00a0al. (2022b) Panoptic scene graph generation. In: European conference on computer vision. Springer, pp 178\u2013196","DOI":"10.1007\/978-3-031-19812-0_11"},{"key":"203_CR50","doi-asserted-by":"crossref","unstructured":"Yang J, Lu J, Lee S, et\u00a0al. (2018) Graph r-cnn for scene graph generation. In: Proceedings of the European conference on computer vision (ECCV). pp 670\u2013685","DOI":"10.1007\/978-3-030-01246-5_41"},{"key":"203_CR51","doi-asserted-by":"crossref","unstructured":"Yang X, Tang K, Zhang H, et\u00a0al. (2019) Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 10685\u201310694","DOI":"10.1109\/CVPR.2019.01094"},{"key":"203_CR52","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, et\u00a0al. (2018) Exploring visual relationship for image captioning. In: Proceedings of the European conference on computer vision (ECCV). pp 684\u2013699","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"203_CR53","doi-asserted-by":"crossref","unstructured":"Yoon K, Kim K, Jeon J, et\u00a0al. (2024) Ra-sgg: retrieval-augmented scene graph generation framework via multi-prototype learning. arXiv:2412.12788","DOI":"10.1609\/aaai.v39i9.33036"},{"key":"203_CR54","unstructured":"Yu J, Wang Z, Vasudevan V, et\u00a0al. (2022) Coca: contrastive captioners are image-text foundation models. arXiv:2205.01917"},{"key":"203_CR55","doi-asserted-by":"crossref","unstructured":"Zellers R, Bisk Y, Farhadi A, et\u00a0al. (2019) From recognition to cognition: visual commonsense reasoning. In: CVPR","DOI":"10.1109\/CVPR.2019.00688"},{"key":"203_CR56","unstructured":"Zhu D, Chen J, Shen X, et\u00a0al. (2023) Minigpt-4: enhancing vision-language understanding with advanced large language models. 
arXiv:2304.10592"}],"container-title":["Journal of King Saud University Computer and Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44443-025-00203-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s44443-025-00203-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44443-025-00203-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T12:42:04Z","timestamp":1758112924000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s44443-025-00203-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,18]]},"references-count":56,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["203"],"URL":"https:\/\/doi.org\/10.1007\/s44443-025-00203-2","relation":{},"ISSN":["1319-1578","2213-1248"],"issn-type":[{"value":"1319-1578","type":"print"},{"value":"2213-1248","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8,18]]},"assertion":[{"value":"11 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors certify that they have no affiliations with or involvement in any organization or entity with any financial interest or non-financial interest in the subject matter or materials discussed in this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interest"}}],"article-number":"178"}}