{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T17:11:57Z","timestamp":1780420317760,"version":"3.54.1"},"reference-count":67,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100010256","name":"Guangzhou Municipal Science and Technology Project","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100010256","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109183","type":"journal-article","created":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T23:39:30Z","timestamp":1779838770000},"page":"109183","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["SAGE: Semantic-guided framework with decoupled optimization for open-vocabulary video visual relationship detection"],"prefix":"10.1016","volume":"203","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8216-4140","authenticated-orcid":false,"given":"Shiqi","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weiying","family":"Xue","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuyi","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haowen","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5378-6404","authenticated-orcid":false,"given":"Qi","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109183_bib0001","series-title":"Proceedings of the advances in neural information processing systems (neurIPS)","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume":"vol. 35","author":"Alayrac","year":"2022"},{"issue":"12","key":"10.1016\/j.neunet.2026.109183_bib0002","doi-asserted-by":"crossref","first-page":"13083","DOI":"10.1109\/TKDE.2023.3270328","article-title":"Video visual relation detection with contextual knowledge embedding","volume":"35","author":"Cao","year":"2023","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"10.1016\/j.neunet.2026.109183_bib0003","series-title":"Proceedings of the international conference on learning representations (ICLR)","article-title":"Plot: Prompt learning with optimal transport for vision-language models","author":"Chen","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0004","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision (ICCV)","first-page":"13485","article-title":"Social fabric: Tubelet compositions for video relation detection","author":"Chen","year":"2021"},{"key":"10.1016\/j.neunet.2026.109183_bib0005","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"10337","article-title":"Memory enhanced global-local aggregation for video object detection","author":"Chen","year":"2020"},{"key":"10.1016\/j.neunet.2026.109183_bib0006","unstructured":"Chu, X., Qiao, L., Zhang, X., Xu, S., Wei, F., Yang, Y., Sun, Y., Hu, Y., Wang, X., & Zhang, B. (2024). MobileVLM v2: Faster and stronger baseline for vision language models. arXiv preprint arXiv: 2402.03766."},{"key":"10.1016\/j.neunet.2026.109183_bib0007","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision (ICCV)","first-page":"16372","article-title":"Spatial-temporal transformer for dynamic scene graph generation","author":"Cong","year":"2021"},{"key":"10.1016\/j.neunet.2026.109183_bib0008","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"19497","article-title":"Classification-then-grounding: Reformulating video scene graphs as temporal bipartite graphs","author":"Gao","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0009","unstructured":"Gao, K., Chen, L., Zhang, H., Xiao, J., & Sun, Q. (2023). Compositional prompt tuning with motion cues for open-vocabulary video relation detection. arXiv preprint arXiv: 2302.00268."},{"key":"10.1016\/j.neunet.2026.109183_bib0010","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"266","article-title":"Open vocabulary object detection with pseudo bounding-box labels","author":"Gao","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0011","unstructured":"Gu, X., Lin, T.-Y., Kuo, W., & Cui, Y. (2021). Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv: 2104.13921."},{"key":"10.1016\/j.neunet.2026.109183_bib0012","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR)","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.neunet.2026.109183_bib0013","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"56","article-title":"Towards open-vocabulary scene graph generation with prompt-based finetuning","author":"He","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0014","doi-asserted-by":"crossref","unstructured":"Herzig, R., Mendelson, A., Karlinsky, L., Arbelle, A., Feris, R., Darrell, T., & Globerson, A. (2023). Incorporating structured representations into pretrained vision & language models using scene graphs. arXiv preprint arXiv: 2305.06343.","DOI":"10.18653\/v1\/2023.emnlp-main.870"},{"key":"10.1016\/j.neunet.2026.109183_bib0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106200","article-title":"Hierarchical matching and reasoning for multi-query image retrieval","volume":"173","author":"Ji","year":"2024","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109183_bib0016","series-title":"Proceedings of the international conference on machine learning (ICML)","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.neunet.2026.109183_bib0017","unstructured":"Jia, J., Hu, Y., Weng, X. et al. (2024). TinyLLaVA factory: A modularized codebase for small-scale large multimodal models. arXiv preprint arXiv: 2405.11788."},{"key":"10.1016\/j.neunet.2026.109183_bib0018","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"709","article-title":"Visual prompt tuning","author":"Jia","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0019","series-title":"Proceedings of the 32nd ACM international conference on multimedia","first-page":"1437","article-title":"VrdONE: One-stage video visual relation detection","author":"Jiang","year":"2024"},{"key":"10.1016\/j.neunet.2026.109183_bib0020","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision (ICCV)","first-page":"15670","article-title":"Knowledge-aware prompt tuning for generalizable vision-language models","author":"Kan","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0021","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"19113","article-title":"Maple: Multi-modal prompt learning","author":"Khattak","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0022","doi-asserted-by":"crossref","first-page":"158","DOI":"10.1016\/j.neunet.2021.02.001","article-title":"Visual question answering based on local-scene-aware referring expression generation","volume":"139","author":"Kim","year":"2021","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109183_bib0023","unstructured":"Kuo, W., Cui, Y., Gu, X., Piergiovanni, A. J., & Angelova, A. (2022). F-VLM: Open-vocabulary object detection upon frozen vision and language models. arXiv preprint arXiv: 2209.15639."},{"key":"10.1016\/j.neunet.2026.109183_bib0024","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"4953","article-title":"Align and prompt: Video-and-language pre-training with entity prompts","author":"Li","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0025","series-title":"Proceedings of the international conference on machine learning (ICML)","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0026","series-title":"Proceedings of the international conference on machine learning (ICML)","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0027","series-title":"Proceedings of the advances in neural information processing systems (NeurIPS)","article-title":"Zero-shot visual relation detection via composite visual cues from large language models","volume":"vol. 36","author":"Li","year":"2024"},{"key":"10.1016\/j.neunet.2026.109183_bib0028","series-title":"Proceedings of the ACM international conference on multimedia (ACM MM)","first-page":"4091","article-title":"Interventional video relation detection","author":"Li","year":"2021"},{"key":"10.1016\/j.neunet.2026.109183_bib0029","series-title":"Proceedings of the AAAI conference on artificial intelligence (AAAI)","first-page":"3495","article-title":"TD2-Net: Toward denoising and debiasing for video scene graph generation","volume":"vol. 38","author":"Lin","year":"2024"},{"key":"10.1016\/j.neunet.2026.109183_bib0030","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"10840","article-title":"Beyond short-term snippet: Video relation detection with spatio-temporal global context","author":"Liu","year":"2020"},{"key":"10.1016\/j.neunet.2026.109183_bib0031","unstructured":"Loshchilov, I., & Hutter, F. (2017). Decoupled weight decay regularization. arXiv preprint arXiv: 1711.05101."},{"key":"10.1016\/j.neunet.2026.109183_bib0032","unstructured":"Luo, H., Ji, L., Shi, B., Huang, H., Duan, N., Li, T., Li, J., Bharti, T., & Zhou, M. (2020). UniVL: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv: 2002.06353."},{"key":"10.1016\/j.neunet.2026.109183_bib0033","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"1","article-title":"Expanding language-image pretrained models for general video recognition","author":"Ni","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0034","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2023.126658","article-title":"Combined scaling for zero-shot transfer learning","volume":"555","author":"Pham","year":"2023","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neunet.2026.109183_bib0035","series-title":"Proceedings of the ACM international conference on multimedia (ACM MM)","first-page":"84","article-title":"Video relation detection with spatio-temporal graph","author":"Qian","year":"2019"},{"key":"10.1016\/j.neunet.2026.109183_bib0036","doi-asserted-by":"crossref","first-page":"434","DOI":"10.1016\/j.neunet.2022.05.008","article-title":"Visual context learning based on textual knowledge for image-text retrieval","volume":"152","author":"Qin","year":"2022","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109183_bib0037","series-title":"Proceedings of the international conference on machine learning (ICML)","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neunet.2026.109183_bib0038","unstructured":"Shafir, Y., Tevet, G., Kapon, R., & Bermano, A. H. (2023). Human motion diffusion as a generative prior. arXiv preprint arXiv: 2303.01418."},{"key":"10.1016\/j.neunet.2026.109183_bib0039","series-title":"Proceedings of the 2019 on international conference on multimedia retrieval","first-page":"279","article-title":"Annotating objects and relations in user-generated videos","author":"Shang","year":"2019"},{"key":"10.1016\/j.neunet.2026.109183_bib0040","series-title":"Proceedings of the ACM international conference on multimedia (ACM MM)","first-page":"3654","article-title":"Video visual relation detection via iterative inference","author":"Shang","year":"2021"},{"key":"10.1016\/j.neunet.2026.109183_bib0041","series-title":"Proceedings of the ACM international conference on multimedia (ACM MM)","first-page":"1300","article-title":"Video visual relation detection","author":"Shang","year":"2017"},{"key":"10.1016\/j.neunet.2026.109183_bib0042","series-title":"The eleventh international conference on learning representations","article-title":"Human motion diffusion model","author":"Tevet","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0043","doi-asserted-by":"crossref","first-page":"242","DOI":"10.1016\/j.neunet.2023.11.002","article-title":"Human-object interaction detection via global context and pairwise-level fusion features integration","volume":"170","author":"Wang","year":"2024","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109183_bib0044","series-title":"European conference on computer vision (ECCV)","first-page":"471","article-title":"The all-seeing project V2: Towards general relation comprehension of the open world","author":"Wang","year":"2024"},{"key":"10.1016\/j.neunet.2026.109183_bib0045","series-title":"Proceedings of the international conference on machine learning (ICML)","first-page":"36978","article-title":"Open-VCLIP: Transforming CLIP to an open-vocabulary video model via interpolated weight optimization","author":"Weng","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0046","series-title":"Proceedings of the IEEE international conference on image processing (ICIP)","first-page":"3645","article-title":"Simple online and realtime tracking with a deep association metric","author":"Wojke","year":"2017"},{"issue":"7","key":"10.1016\/j.neunet.2026.109183_bib0047","doi-asserted-by":"crossref","first-page":"5092","DOI":"10.1109\/TPAMI.2024.3361862","article-title":"Towards open vocabulary learning: A survey","volume":"46","author":"Wu","year":"2024","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.109183_bib0048","series-title":"Proceedings of the 32nd ACM international conference on multimedia","first-page":"8566","article-title":"Open-vocabulary video scene graph generation via union-aware semantic alignment","author":"Wu","year":"2024"},{"key":"10.1016\/j.neunet.2026.109183_bib0049","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"374","article-title":"Meta spatio-temporal debiasing for video scene graph generation","author":"Xu","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0050","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"2945","article-title":"Side adapter network for open-vocabulary semantic segmentation","author":"Xu","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0051","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2025.107348","article-title":"Towards zero-shot human-object interaction detection via vision-language integration","volume":"187","author":"Xue","year":"2025","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109183_bib0052","series-title":"Proceedings of the AAAI conference on artificial intelligence (AAAI)","first-page":"6513","article-title":"Multi-modal prompting for open-vocabulary video visual relationship detection","volume":"vol. 38","author":"Yang","year":"2024"},{"key":"10.1016\/j.neunet.2026.109183_bib0053","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"6757","article-title":"Visual-language prompt tuning with knowledge-guided context optimization","author":"Yao","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0054","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., & Xu, C. (2024a). TCP: Textual-based class-aware prompt tuning for visual-language model. arXiv preprint arXiv: 2311.18231.","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"10.1016\/j.neunet.2026.109183_bib0055","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"27391","article-title":"DetCLIPv3: Towards versatile generative open-vocabulary object detection","author":"Yao","year":"2024"},{"key":"10.1016\/j.neunet.2026.109183_bib0056","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision (ICCV)","first-page":"21560","article-title":"Visually-prompted language model for fine-grained scene graph generation in an open world","author":"Yu","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0057","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"21649","article-title":"RLIPv2: Fast scaling of relational language-image pre-training","author":"Yuan","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0058","series-title":"The eleventh international conference on learning representations (ICLR)","article-title":"When and why vision-language models behave like bags-of-words, and what to do about it?","author":"Yuksekgonul","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0059","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., & Loy, C. C. (2022). Unified vision and language prompt learning. arXiv preprint arXiv: 2210.07225."},{"issue":"12","key":"10.1016\/j.neunet.2026.109183_bib0060","doi-asserted-by":"crossref","first-page":"12425","DOI":"10.1109\/TCSVT.2024.3437437","article-title":"Entity dependency learning network with relation prediction for video visual relation detection","volume":"34","author":"Zhang","year":"2024","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.neunet.2026.109183_bib0061","unstructured":"Zhang, M., Cai, Z., Pan, L., Hong, F., Guo, X., Yang, L., & Liu, Z. (2022). Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv: 2208.15001."},{"key":"10.1016\/j.neunet.2026.109183_bib0062","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"6962","article-title":"Unified visual relationship detection with vision and language models","author":"Zhao","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0063","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"18836","article-title":"VRDFormer: End-to-end video visual relation detection with transformers","author":"Zheng","year":"2022"},{"key":"10.1016\/j.neunet.2026.109183_bib0064","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"16816","article-title":"Conditional prompt learning for vision-language models","author":"Zhou","year":"2022"},{"issue":"9","key":"10.1016\/j.neunet.2026.109183_bib0065","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","volume":"130","author":"Zhou","year":"2022","journal-title":"International Journal of Computer Vision (IJCV)"},{"key":"10.1016\/j.neunet.2026.109183_bib0066","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision (ICCV)","first-page":"15659","article-title":"Prompt-aligned gradient for prompt tuning","author":"Zhu","year":"2023"},{"key":"10.1016\/j.neunet.2026.109183_bib0067","series-title":"Proceedings of the AAAI conference on artificial intelligence (AAAI)","first-page":"3834","article-title":"Debiased fine-tuning for vision-language models by prompt regularization","author":"Zhu","year":"2023"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026006441?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026006441?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T16:58:26Z","timestamp":1780419506000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026006441"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":67,"alternative-id":["S0893608026006441"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109183","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"SAGE: Semantic-guided framework with decoupled optimization for open-vocabulary video visual relationship detection","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109183","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109183"}}