{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T00:36:34Z","timestamp":1765499794304,"version":"3.48.0"},"reference-count":57,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.eswa.2025.129988","type":"journal-article","created":{"date-parts":[[2025,10,29]],"date-time":"2025-10-29T07:54:00Z","timestamp":1761724440000},"page":"129988","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PC","title":["Hierarchical image understanding and diffusion-enhanced generative prompting for human-object interaction detection"],"prefix":"10.1016","volume":"299","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1036-3321","authenticated-orcid":false,"given":"Pinzhu","family":"An","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1215-9807","authenticated-orcid":false,"given":"Zhi","family":"Tan","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"5","key":"10.1016\/j.eswa.2025.129988_bib0001","doi-asserted-by":"crossref","first-page":"469","DOI":"10.1016\/j.robot.2008.10.024","article-title":"A survey of robot learning from demonstration","volume":"57","author":"Argall","year":"2009","journal-title":"Robotics and Autonomous Systems"},{"key":"10.1016\/j.eswa.2025.129988_bib0002","first-page":"739","article-title":"Detecting any human-object interaction relationship: universal hoi detector with spatial prompt learning on foundation models","volume":"36","author":"Cao","year":"2023","journal-title":"Advances in Neural Information Processing systems"},{"key":"10.1016\/j.eswa.2025.129988_bib0003","series-title":"European conference on computer vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.eswa.2025.129988_bib0004","series-title":"2018 IEEE winter conference on applications of computer vision (wacv)","first-page":"381","article-title":"Learning to detect human-object interactions","author":"Chao","year":"2018"},{"key":"10.1016\/j.eswa.2025.129988_bib0005","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"9004","article-title":"Reformulating hoi detection as adaptive set prediction","author":"Chen","year":"2021"},{"key":"10.1016\/j.eswa.2025.129988_bib0006","series-title":"International conference on learning representations","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.eswa.2025.129988_bib0007","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"224","article-title":"Use the force, luke! learning to predict physical forces by simulating effects","author":"Ehsani","year":"2020"},{"key":"10.1016\/j.eswa.2025.129988_bib0008","doi-asserted-by":"crossref","first-page":"3125","DOI":"10.1109\/TMM.2023.3307896","article-title":"Hodn: Disentangling human-object feature for hoi detection","volume":"26","author":"Fang","year":"2023","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.eswa.2025.129988_bib0009","series-title":"Computer vision\u2013ECCV 2020: 16th European conference, glasgow, UK, august 23\u201328, 2020, proceedings, part XII 16","first-page":"696","article-title":"DRG: Dual relation graph for human-object interaction detection","author":"Gao","year":"2020"},{"key":"10.1016\/j.eswa.2025.129988_bib0010","unstructured":"Gao, C., Zou, Y., & Huang, J.-B. (2018). ICAN: Instance-centric attention network for human-object interaction detection. arXiv preprint arXiv:1808.10437."},{"key":"10.1016\/j.eswa.2025.129988_bib0011","series-title":"Proceedings of the IEEE International conference on computer vision","first-page":"1440","article-title":"Fast R-CNN","author":"Girshick","year":"2015"},{"key":"10.1016\/j.eswa.2025.129988_bib0012","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"580","article-title":"Rich feature hierarchies for accurate object detection and semantic segmentation","author":"Girshick","year":"2014"},{"key":"10.1016\/j.eswa.2025.129988_bib0013","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"8359","article-title":"Detecting and recognizing human-object interactions","author":"Gkioxari","year":"2018"},{"key":"10.1016\/j.eswa.2025.129988_bib0014","series-title":"Proceedings of the 32nd ACM International conference on multimedia","first-page":"1711","article-title":"Unseen no more: Unlocking the potential of CLIP for generative zero-shot HOI detection","author":"Guo","year":"2024"},{"key":"10.1016\/j.eswa.2025.129988_bib0015","unstructured":"Gupta, S., & Malik, J. (2015). Visual semantic role labeling. arXiv preprint arXiv:1505.04474."},{"key":"10.1016\/j.eswa.2025.129988_bib0016","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.eswa.2025.129988_bib0017","series-title":"European conference on computer vision","first-page":"584","article-title":"Visual compositional learning for human-object interaction detection","author":"Hou","year":"2020"},{"key":"10.1016\/j.eswa.2025.129988_bib0018","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"495","article-title":"Affordance transfer learning for human-object interaction detection","author":"Hou","year":"2021"},{"key":"10.1016\/j.eswa.2025.129988_bib0019","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"14646","article-title":"Detecting human-object interaction via fabricated compositional learning","author":"Hou","year":"2021"},{"key":"10.1016\/j.eswa.2025.129988_bib0020","series-title":"ACM siggraph 2024 conference papers","first-page":"1","article-title":"Hand-object interaction controller (HOIC): Deep reinforcement learning for reconstructing interactions with physics","author":"Hu","year":"2024"},{"issue":"1","key":"10.1016\/j.eswa.2025.129988_bib0021","first-page":"211","article-title":"Robotic tactile recognition and adaptive grasping control based on CNN-LSTM","volume":"40","author":"Hui","year":"2019","journal-title":"Chinese Journal of Scientific Instrument"},{"key":"10.1016\/j.eswa.2025.129988_bib0022","series-title":"Computer vision\u2013ECCV 2020: 16th European conference, glasgow, UK, august 23\u201328, 2020, proceedings, part XV 16","first-page":"498","article-title":"UnionDet: Union-level detector towards real-time human-object interaction detection","author":"Kim","year":"2020"},{"key":"10.1016\/j.eswa.2025.129988_bib0023","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"74","article-title":"HOTR: End-to-end human-object interaction detection with transformers","author":"Kim","year":"2021"},{"key":"10.1016\/j.eswa.2025.129988_bib0024","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"2925","article-title":"Relational context learning for human-object interaction detection","author":"Kim","year":"2023"},{"key":"10.1016\/j.eswa.2025.129988_bib0025","doi-asserted-by":"crossref","first-page":"23655","DOI":"10.52202\/079017-0746","article-title":"Human-object interaction detection collaborated with large relation-driven diffusion models","volume":"37","author":"Li","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2025.129988_bib0026","first-page":"21158","article-title":"Neural-logic human-object interaction detection","volume":"36","author":"Li","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2025.129988_bib0027","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"482","article-title":"PPDM: Parallel point detection and matching for real-time human-object interaction detection","author":"Liao","year":"2020"},{"key":"10.1016\/j.eswa.2025.129988_bib0028","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"20123","article-title":"Gen-VLKT: Simplify association and enhance interaction understanding for hoi detection","author":"Liao","year":"2022"},{"key":"10.1016\/j.eswa.2025.129988_bib0029","series-title":"Computer vision\u2013ECCV 2014: 13th European conference, zurich, Switzerland, september 6\u201312, 2014, proceedings, part v 13","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.eswa.2025.129988_bib0030","unstructured":"Loshchilov, I., & Hutter, F. (2017). Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101."},{"key":"10.1016\/j.eswa.2025.129988_bib0031","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"28212","article-title":"Discovering syntactic interaction clues for human-object interaction detection","author":"Luo","year":"2024"},{"key":"10.1016\/j.eswa.2025.129988_bib0032","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"23507","article-title":"HoiClip: Efficient knowledge transfer for hoi detection with vision-language models","author":"Ning","year":"2023"},{"key":"10.1016\/j.eswa.2025.129988_bib0033","series-title":"International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"issue":"6","key":"10.1016\/j.eswa.2025.129988_bib0034","doi-asserted-by":"crossref","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume":"39","author":"Ren","year":"2016","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2025.129988_bib0035","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"10410","article-title":"QPIC: Query-based pairwise human-object interaction detection with image-wide contextual information","author":"Tamura","year":"2021"},{"key":"10.1016\/j.eswa.2025.129988_bib0036","doi-asserted-by":"crossref","unstructured":"Tang, R., Liu, L., Pandey, A., Jiang, Z., Yang, G., Kumar, K., Stenetorp, P., Lin, J., & Ture, F. (2022). What the daam: Interpreting stable diffusion using cross attention. arXiv preprint arXiv:2210.04885.","DOI":"10.18653\/v1\/2023.acl-long.310"},{"key":"10.1016\/j.eswa.2025.129988_bib0037","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"13617","article-title":"VSGNet: Spatial attention network for detecting human object interactions using graph convolutions","author":"Ulutan","year":"2020"},{"key":"10.1016\/j.eswa.2025.129988_bib0038","series-title":"Proceedings of the IEEE\/CVF International conference on computer vision","first-page":"9469","article-title":"Pose-aware multi-level feature network for human object interaction detection","author":"Wan","year":"2019"},{"key":"10.1016\/j.eswa.2025.129988_bib0039","doi-asserted-by":"crossref","first-page":"6583","DOI":"10.1109\/TIP.2021.3096333","article-title":"IPGN: Interactiveness proposal graph network for human-object interaction detection","volume":"30","author":"Wang","year":"2021","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.1016\/j.eswa.2025.129988_bib0040","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"939","article-title":"Learning transferable human-object interaction detector with natural language supervision","author":"Wang","year":"2022"},{"key":"10.1016\/j.eswa.2025.129988_bib0041","series-title":"Proceedings of the IEEE\/CVF International conference on computer vision","first-page":"13475","article-title":"Discovering human interactions with large-vocabulary objects via query and multi-scale detection","author":"Wang","year":"2021"},{"key":"10.1016\/j.eswa.2025.129988_bib0042","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"4116","article-title":"Learning human-object interaction detection using interaction points","author":"Wang","year":"2020"},{"key":"10.1016\/j.eswa.2025.129988_bib0043","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"17815","article-title":"Exploring pose-aware human-object interaction via hybrid learning","author":"Wu","year":"2024"},{"key":"10.1016\/j.eswa.2025.129988_bib0044","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"2839","article-title":"End-to-end zero-shot hoi detection via vision and language knowledge distillation","volume":"vol. 37","author":"Wu","year":"2023"},{"key":"10.1016\/j.eswa.2025.129988_bib0045","series-title":"European conference on computer vision","first-page":"121","article-title":"Mining cross-person cues for body-part interactiveness learning in hoi detection","author":"Wu","year":"2022"},{"key":"10.1016\/j.eswa.2025.129988_bib0046","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"16954","article-title":"Open-world human-object interaction detection via multi-modal prompts","author":"Yang","year":"2024"},{"key":"10.1016\/j.eswa.2025.129988_bib0047","series-title":"Proceedings of the IEEE\/CVF International conference on computer vision","first-page":"21649","article-title":"Rlipv2: Fast scaling of relational language-image pre-training","author":"Yuan","year":"2023"},{"key":"10.1016\/j.eswa.2025.129988_bib0048","first-page":"17209","article-title":"Mining the benefits of two-stage and one-stage hoi detection","volume":"34","author":"Zhang","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2025.129988_bib0049","series-title":"Proceedings of the IEEE\/CVF International conference on computer vision","first-page":"13319","article-title":"Spatially conditioned graphs for detecting human-object interactions","author":"Zhang","year":"2021"},{"key":"10.1016\/j.eswa.2025.129988_bib0050","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"20104","article-title":"Efficient two-stage detection of human-object interactions with a novel unary-pairwise transformer","author":"Zhang","year":"2022"},{"key":"10.1016\/j.eswa.2025.129988_bib0051","series-title":"Proceedings of the IEEE\/CVF International conference on computer vision","first-page":"10411","article-title":"Exploring predicate visual context in detecting of human-object interactions","author":"Zhang","year":"2023"},{"key":"10.1016\/j.eswa.2025.129988_bib0052","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"19548","article-title":"Exploring structure-aware transformer over interaction proposals for human-object interaction detection","author":"Zhang","year":"2022"},{"issue":"6","key":"10.1016\/j.eswa.2025.129988_bib0053","doi-asserted-by":"crossref","first-page":"1910","DOI":"10.1007\/s11263-021-01458-8","article-title":"Polysemy deciphering network for robust human\u2013object interaction detection","volume":"129","author":"Zhong","year":"2021","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.eswa.2025.129988_bib0054","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"19568","article-title":"Human-object interaction detection via disentangled transformer","author":"Zhou","year":"2022"},{"key":"10.1016\/j.eswa.2025.129988_bib0055","series-title":"2022 8th international conference on virtual reality (ICVR)","first-page":"298","article-title":"ER-Net: Efficient recalibration network for multi-view multi-person 3D pose estimation","author":"Zhou","year":"2022"},{"key":"10.1016\/j.eswa.2025.129988_bib0056","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"843","article-title":"Relation parsing neural network for human-object interaction detection","author":"Zhou","year":"2019"},{"key":"10.1016\/j.eswa.2025.129988_bib0057","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"11825","article-title":"End-to-end human object interaction detection with hoi transformer","author":"Zou","year":"2021"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417425036036?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417425036036?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T00:31:50Z","timestamp":1765499510000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417425036036"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":57,"alternative-id":["S0957417425036036"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2025.129988","relation":{},"ISSN":["0957-4174"],"issn-type":[{"type":"print","value":"0957-4174"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Hierarchical image understanding and diffusion-enhanced generative prompting for human-object interaction detection","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2025.129988","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"129988"}}