{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T17:26:05Z","timestamp":1775323565217,"version":"3.50.1"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T00:00:00Z","timestamp":1737504000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T00:00:00Z","timestamp":1737504000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s10489-025-06279-7","type":"journal-article","created":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T01:26:43Z","timestamp":1737509203000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Knowledge guided relation enhancement for human-object interaction detection"],"prefix":"10.1007","volume":"55","author":[{"given":"Rui","family":"Su","sequence":"first","affiliation":[]},{"given":"Yongbin","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Wenjun","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Chenmou","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Xiaoyan","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Shubo","family":"Zhou","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,22]]},"reference":[{"issue":"3","key":"6279_CR1","doi-asserted-by":"publisher","first-page":"395","DOI":"10.1007\/s41095-021-0252-6","volume":"8","author":"Y Lan","year":"2022","unstructured":"Lan Y, Duan Y, Liu C, Zhu C, Xiong Y, Huang H, Xu K (2022) Arm3d: Attention-based relation module for indoor 3d object detection. Comput Vis Media 8(3):395\u2013414","journal-title":"Comput Vis Media"},{"key":"6279_CR2","unstructured":"Seita D, Wang Y, Shetty SJ, Li EY, Erickson Z, Held D (2023) Toolflownet: Robotic manipulation with tools via predicting tool flow from point clouds. In: conference on robot learning, PMLR, pp 1038\u20131049"},{"key":"6279_CR3","doi-asserted-by":"crossref","unstructured":"Zhang Y, Tokmakov P, Hebert M, Schmid C (2019) A structured model for action detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9975\u20139984","DOI":"10.1109\/CVPR.2019.01021"},{"key":"6279_CR4","doi-asserted-by":"crossref","unstructured":"Nadeem A, Jalal A, Kim K (2020) Human actions tracking and recognition based on body parts detection via artificial neural network. In: 2020 3rd international conference on advancements in computational sciences (ICACS), IEEE, pp 1\u20136","DOI":"10.1109\/ICACS47775.2020.9055951"},{"key":"6279_CR5","doi-asserted-by":"crossref","unstructured":"Moon G, Kwon H, Lee KM, Cho M (2021) Integralaction: Pose-driven feature integration for robust human action recognition in videos. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3339\u20133348","DOI":"10.1109\/CVPRW53098.2021.00372"},{"key":"6279_CR6","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Mei T (2018) Exploring visual relationship for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 684\u2013699","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"6279_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2022.104575","volume":"128","author":"J Hu","year":"2022","unstructured":"Hu J, Yang Y, Yao L, An Y, Pan L (2022) Position-guided transformer for image captioning. Image Vis Comput 128:104575","journal-title":"Image Vis Comput"},{"key":"6279_CR8","doi-asserted-by":"crossref","unstructured":"Guo J (2023) Robust image captioning using knowledge distillation. In: 2023 IEEE international conference on sensors, electronics and computer engineering (ICSECE), IEEE, pp 881\u2013885","DOI":"10.1109\/ICSECE58870.2023.10263327"},{"key":"6279_CR9","doi-asserted-by":"crossref","unstructured":"Zhang FZ, Yuan Y, Campbell D, Zhong Z, Gould S (2023) Exploring predicate visual context in detecting of human-object interactions. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10411\u201310421","DOI":"10.1109\/ICCV51070.2023.00955"},{"key":"6279_CR10","doi-asserted-by":"crossref","unstructured":"Park J, Park JW, Lee JS (2023) Viplo: Vision transformer based pose-conditioned self-loop graph for human-object interaction detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 17152\u201317162","DOI":"10.1109\/CVPR52729.2023.01645"},{"key":"6279_CR11","doi-asserted-by":"crossref","unstructured":"Tamura M, Ohashi H, Yoshinaga T (2021) Qpic: Query-based pairwise human-object interaction detection with image-wide contextual information. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10410\u201310419","DOI":"10.1109\/CVPR46437.2021.01027"},{"key":"6279_CR12","doi-asserted-by":"crossref","unstructured":"Zhang FZ, Campbell D, Gould S (2022) Efficient two-stage detection of human-object interactions with a novel unary-pairwise transformer. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 20104\u201320112","DOI":"10.1109\/CVPR52688.2022.01947"},{"key":"6279_CR13","first-page":"37416","volume":"35","author":"H Yuan","year":"2022","unstructured":"Yuan H, Jiang J, Albanie S, Feng T, Huang Z, Ni D, Tang M (2022) Rlip: Relational language-image pre-training for human-object interaction detection. Adv Neural Inf Process Syst 35:37416\u201337431","journal-title":"Adv Neural Inf Process Syst"},{"key":"6279_CR14","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"6279_CR15","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. In: European conference on computer vision, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"6279_CR16","doi-asserted-by":"crossref","unstructured":"Zou C, Wang B, Hu Y, Liu J, Wu Q, Zhao Y, Li B, Zhang C, Zhang C, Wei Y et al (2021) End-to-end human object interaction detection with hoi transformer. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 11825\u201311834","DOI":"10.1109\/CVPR46437.2021.01165"},{"key":"6279_CR17","doi-asserted-by":"crossref","unstructured":"Liao Y, Zhang A, Lu M, Wang Y, Li X, Liu S (2022) Gen-vlkt: Simplify association and enhance interaction understanding for hoi detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 20123\u201320132","DOI":"10.1109\/CVPR52688.2022.01949"},{"key":"6279_CR18","doi-asserted-by":"crossref","unstructured":"Ning S, Qiu L, Liu Y, He X (2023) Hoiclip: Efficient knowledge transfer for hoi detection with vision-language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 23507\u201323517","DOI":"10.1109\/CVPR52729.2023.02251"},{"key":"6279_CR19","doi-asserted-by":"crossref","unstructured":"Fang S, Lin Z, Yan K, Li J, Lin X, Ji R (2023) Hodn: Disentangling human-object feature for hoi detection. IEEE Trans Multimed","DOI":"10.1109\/TMM.2023.3307896"},{"key":"6279_CR20","doi-asserted-by":"crossref","unstructured":"Dong J, Yang H, Pan R (2024) Exploring interactive semantic alignment for efficient hoi detection with vision-language model. In: 2024 IEEE international conference on multimedia and expo (ICME), IEEE, pp 1\u20136","DOI":"10.1109\/ICME57554.2024.10688344"},{"key":"6279_CR21","doi-asserted-by":"crossref","unstructured":"Li Z, Li X, Ding C, Xu X (2024) Disentangled pre-training for human-object interaction detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 28191\u201328201","DOI":"10.1109\/CVPR52733.2024.02663"},{"key":"6279_CR22","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J et al (2021) Learning transferable visual models from natural language supervision. In: international conference on machine learning, PMLR, pp 8748\u20138763"},{"key":"6279_CR23","unstructured":"Ramesh A, Dhariwal P, Nichol A, Chu C, Chen M (2022) Hierarchical text-conditional image generation with clip latents. 1(2):3. arXiv preprint arXiv:2204.06125"},{"key":"6279_CR24","doi-asserted-by":"crossref","unstructured":"Patashnik O, Wu Z, Shechtman E, Cohen-Or D, Lischinski D (2021) Styleclip: Text-driven manipulation of stylegan imagery. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2085\u20132094","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"6279_CR25","doi-asserted-by":"crossref","unstructured":"Wan B, Zhou D, Liu Y, Li R, He X (2019) Pose-aware multi-level feature network for human object interaction detection. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 9469\u20139478","DOI":"10.1109\/ICCV.2019.00956"},{"key":"6279_CR26","doi-asserted-by":"publisher","first-page":"229","DOI":"10.1007\/s41095-020-0188-2","volume":"7","author":"H Liu","year":"2021","unstructured":"Liu H, Mu T-J, Huang X (2021) Detecting human\u2014object interaction with multi-level pairwise feature network. Comput Vis Media 7:229\u2013239","journal-title":"Comput Vis Media"},{"key":"6279_CR27","first-page":"20014","volume":"34","author":"A Ali","year":"2021","unstructured":"Ali A, Touvron H, Caron M, Bojanowski P, Douze M, Joulin A, Laptev I, Neverova N, Synnaeve G, Verbeek J et al (2021) Xcit: Cross-covariance image transformers. Adv Neural Inf Process Syst 34:20014\u201320027","journal-title":"Adv Neural Inf Process Syst"},{"key":"6279_CR28","doi-asserted-by":"crossref","unstructured":"Chao YW, Liu Y, Liu X, Zeng H, Deng J (2018) Learning to detect human-object interactions. In: 2018 IEEE winter conference on applications of computer vision (wacv), IEEE, pp 381\u2013389","DOI":"10.1109\/WACV.2018.00048"},{"key":"6279_CR29","unstructured":"Gupta S, Malik J (2015) Visual semantic role labeling. arXiv preprint arXiv:1505.04474"},{"key":"6279_CR30","unstructured":"Gao C, Zou Y, Huang JB (2018) ican: Instance-centric attention network for human-object interaction detection. arXiv preprint arXiv:1808.10437"},{"key":"6279_CR31","doi-asserted-by":"crossref","unstructured":"Kim DJ, Sun X, Choi J, Lin S, Kweon IS (2020) Detecting human-object interactions with action co-occurrence priors. In: Computer Vision\u2013ECCV 2020: 16th European conference, glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXI 16, Springer, pp 718\u2013736","DOI":"10.1007\/978-3-030-58589-1_43"},{"issue":"12","key":"6279_CR32","doi-asserted-by":"publisher","first-page":"4495","DOI":"10.1007\/s10489-020-01794-1","volume":"50","author":"L Xia","year":"2020","unstructured":"Xia L, Li R (2020) Multi-stream neural network fused with local information and global information for hoi detection. Appl Intell 50(12):4495\u20134505","journal-title":"Appl Intell"},{"key":"6279_CR33","first-page":"5011","volume":"33","author":"Y-L Li","year":"2020","unstructured":"Li Y-L, Liu X, Wu X, Li Y, Lu C (2020) Hoi analysis: Integrating and decomposing human-object interaction. Adv Neural Inf Process Syst 33:5011\u20135022","journal-title":"Adv Neural Inf Process Syst"},{"key":"6279_CR34","doi-asserted-by":"crossref","unstructured":"Zhang FZ, Campbell D, Gould S (2021) Spatially conditioned graphs for detecting human-object interactions. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 13319\u201313327","DOI":"10.1109\/ICCV48922.2021.01307"},{"key":"6279_CR35","doi-asserted-by":"crossref","unstructured":"Lei T, Caba F, Chen Q, Jin H, Peng Y, Liu Y (2023) Efficient adaptive human-object interaction detection with concept-guided memory. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6480\u20136490","DOI":"10.1109\/ICCV51070.2023.00596"},{"key":"6279_CR36","doi-asserted-by":"crossref","unstructured":"Kim B, Choi T, Kang J, Kim HJ (2020) Uniondet: Union-level detector towards real-time human-object interaction detection. In: Computer Vision\u2013ECCV 2020: 16th European conference, glasgow, UK, August 23\u201328, 2020, Proceedings, Part XV 16, Springer, pp 498\u2013514","DOI":"10.1007\/978-3-030-58555-6_30"},{"key":"6279_CR37","doi-asserted-by":"crossref","unstructured":"Liao Y, Liu S, Wang F, Chen Y, Qian C, Feng J (2020) Ppdm: Parallel point detection and matching for real-time human-object interaction detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 482\u2013490","DOI":"10.1109\/CVPR42600.2020.00056"},{"key":"6279_CR38","doi-asserted-by":"crossref","unstructured":"Chen M, Liao Y, Liu S, Chen Z, Wang F, Qian C (2021) Reformulating hoi detection as adaptive set prediction. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9004\u20139013","DOI":"10.1109\/CVPR46437.2021.00889"},{"key":"6279_CR39","doi-asserted-by":"crossref","unstructured":"Kim B, Mun J, On KW, Shin M, Lee J, Kim ES (2022) Mstr: Multi-scale transformer for end-to-end human-object interaction detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 19578\u201319587","DOI":"10.1109\/CVPR52688.2022.01897"},{"key":"6279_CR40","doi-asserted-by":"crossref","unstructured":"Kim S, Jung D, Cho M (2023) Relational context learning for human-object interaction detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2925\u20132934","DOI":"10.1109\/CVPR52729.2023.00286"},{"key":"6279_CR41","doi-asserted-by":"crossref","unstructured":"Tu D, Sun W, Zhai G, Shen W (2023) Agglomerative transformer for human-object interaction detection. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 21614\u201321624","DOI":"10.1109\/ICCV51070.2023.01976"},{"issue":"3","key":"6279_CR42","doi-asserted-by":"publisher","first-page":"2831","DOI":"10.1007\/s10489-024-05324-1","volume":"54","author":"L Xia","year":"2024","unstructured":"Xia L, Ding X (2024) Human-object interaction detection based on cascade multi-scale transformer. Appl Intell 54(3):2831\u20132850","journal-title":"Appl Intell"},{"key":"6279_CR43","doi-asserted-by":"crossref","unstructured":"Yuan H, Zhang S, Wang X, Albanie S, Pan Y, Feng T, Jiang J, Ni D, Zhang Y, Zhao D (2023) Rlipv2: Fast scaling of relational language-image pre-training. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 21649\u201321661","DOI":"10.1109\/ICCV51070.2023.01979"},{"key":"6279_CR44","unstructured":"Kipf TN, Welling M (2016) Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907"},{"key":"6279_CR45","unstructured":"Veli\u010dkovi\u0107 P, Cucurull G, Casanova A, Romero A, Lio P, Bengio Y (2017) Graph attention networks. arXiv preprint arXiv:1710.10903"},{"key":"6279_CR46","doi-asserted-by":"crossref","unstructured":"Liu Y, Chen Q, Zisserman A (2020) Amplifying key cues for human-object-interaction detection. In: European conference on computer vision, Springer, pp 248\u2013265","DOI":"10.1007\/978-3-030-58568-6_15"},{"key":"6279_CR47","doi-asserted-by":"crossref","unstructured":"Ulutan O, Iftekhar A, Manjunath BS (2020) Vsgnet: Spatial attention network for detecting human object interactions using graph convolutions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 13617\u201313626","DOI":"10.1109\/CVPR42600.2020.01363"},{"key":"6279_CR48","doi-asserted-by":"crossref","unstructured":"Li YL, Xu L, Liu X, Huang X, Xu Y, Wang S, Fang HS, Ma Z, Chen M, Lu C (2020) Pastanet: Toward human activity knowledge engine. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 382\u2013391","DOI":"10.1109\/CVPR42600.2020.00046"},{"issue":"6","key":"6279_CR49","doi-asserted-by":"publisher","first-page":"1910","DOI":"10.1007\/s11263-021-01458-8","volume":"129","author":"X Zhong","year":"2021","unstructured":"Zhong X, Ding C, Qu X, Tao D (2021) Polysemy deciphering network for robust human-object interaction detection. Int J Comput Vision 129(6):1910\u20131929","journal-title":"Int J Comput Vision"},{"key":"6279_CR50","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.110590","volume":"272","author":"Y Hao","year":"2023","unstructured":"Hao Y, Jing X-Y, Chen R, Liu W (2023) Learning enhanced specific representations for multi-view feature learning. Knowl-Based Syst 272:110590","journal-title":"Knowl-Based Syst"},{"key":"6279_CR51","doi-asserted-by":"crossref","unstructured":"Chen Y, Ma Z, Zhang Z, Qi Z, Yuan C, Shan Y, Li B, Hu W, Qie X, Wu J (2023) Vilem: Visual-language error modeling for image-text retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 11018\u201311027","DOI":"10.1109\/CVPR52729.2023.01060"},{"key":"6279_CR52","doi-asserted-by":"crossref","unstructured":"Xu Z, Zhong W, Su Q, Zhang F (2023) Cross-modal-aware representation learning with syntactic hypergraph convolutional network for videoqa. In: 2023 IEEE international conference on multimedia and expo (ICME), IEEE, pp 384\u2013389","DOI":"10.1109\/ICME55011.2023.00073"},{"issue":"1","key":"6279_CR53","doi-asserted-by":"publisher","first-page":"4542","DOI":"10.1038\/s41467-023-40260-7","volume":"14","author":"X Zhang","year":"2023","unstructured":"Zhang X, Wu C, Zhang Y, Xie W, Wang Y (2023) Knowledge-enhanced visual-language pre-training on chest radiology images. Nat Commun 14(1):4542","journal-title":"Nat Commun"},{"key":"6279_CR54","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109763","volume":"255","author":"H Pan","year":"2022","unstructured":"Pan H, He S, Zhang K, Qu B, Chen C, Shi K (2022) Amam: an attention-based multimodal alignment model for medical visual question answering. Knowl-Based Syst 255:109763","journal-title":"Knowl-Based Syst"},{"key":"6279_CR55","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.110706","volume":"275","author":"C Chen","year":"2023","unstructured":"Chen C, Han D, Shen X (2023) Clvin: Complete language-vision interaction network for visual question answering. Knowl-Based Syst 275:110706","journal-title":"Knowl-Based Syst"},{"issue":"1","key":"6279_CR56","doi-asserted-by":"publisher","first-page":"54","DOI":"10.1007\/s44196-023-00233-6","volume":"16","author":"S Lu","year":"2023","unstructured":"Lu S, Ding Y, Liu M, Yin Z, Yin L, Zheng W (2023) Multiscale feature extraction and fusion of image and text in vqa. Int J Comput Intell Syst 16(1):54","journal-title":"Int J Comput Intell Syst"},{"key":"6279_CR57","doi-asserted-by":"crossref","unstructured":"Wang Z, Li J, Hong Y, Wang Y, Wu Q, Bansal M, Gould S, Tan H, Qiao Y (2023) Scaling data generation in vision-and-language navigation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 12009\u201312020","DOI":"10.1109\/ICCV51070.2023.01103"},{"key":"6279_CR58","doi-asserted-by":"crossref","unstructured":"Huo J, Sun Q, Jiang B, Lin H, Fu Y (2023) Geovln: Learning geometry-enhanced visual representation with slot attention for vision-and-language navigation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 23212\u201323221","DOI":"10.1109\/CVPR52729.2023.02223"},{"key":"6279_CR59","doi-asserted-by":"crossref","unstructured":"Barraco M, Cornia M, Cascianelli S, Baraldi L, Cucchiara R (2022) The unreasonable effectiveness of clip features for image captioning: an experimental analysis. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4662\u20134670","DOI":"10.1109\/CVPRW56347.2022.00512"},{"key":"6279_CR60","doi-asserted-by":"crossref","unstructured":"Zeng Z, Xie Y, Zhang H, Chen C, Chen B, Wang Z (2024) Meacap: Memory-augmented zero-shot image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 14100\u201314110","DOI":"10.1109\/CVPR52733.2024.01337"},{"key":"6279_CR61","doi-asserted-by":"crossref","unstructured":"Rotstein N, Bensa\u00efd D, Brody S, Ganz R, Kimmel R (2024) Fusecap: Leveraging large language models for enriched fused image captions. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 5689\u20135700","DOI":"10.1109\/WACV57701.2024.00559"},{"key":"6279_CR62","doi-asserted-by":"crossref","unstructured":"Hou Z, Yu B, Qiao Y, Peng X, Tao D (2021) Affordance transfer learning for human-object interaction detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 495\u2013504","DOI":"10.1109\/CVPR46437.2021.00056"},{"key":"6279_CR63","doi-asserted-by":"crossref","unstructured":"Hou Z, Yu B, Qiao Y, Peng X, Tao D (2021) Detecting human-object interaction via fabricated compositional learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 14646\u201314655","DOI":"10.1109\/CVPR46437.2021.01441"},{"key":"6279_CR64","doi-asserted-by":"crossref","unstructured":"Kim B, Lee J, Kang J, Kim ES, Kim HJ (2021) Hotr: End-to-end human-object interaction detection with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 74\u201383","DOI":"10.1109\/CVPR46437.2021.00014"},{"key":"6279_CR65","doi-asserted-by":"crossref","unstructured":"Dong Q, Tu Z, Liao H, Zhang Y, Mahadevan V, Soatto S (2021) Visual relationship detection using part-and-sum transformers with composite queries. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 3550\u20133559","DOI":"10.1109\/ICCV48922.2021.00353"},{"key":"6279_CR66","first-page":"17209","volume":"34","author":"A Zhang","year":"2021","unstructured":"Zhang A, Liao Y, Liu S, Lu M, Wang Y, Gao C, Li X (2021) Mining the benefits of two-stage and one-stage hoi detection. Adv Neural Inf Process Syst 34:17209\u201317220","journal-title":"Adv Neural Inf Process Syst"},{"key":"6279_CR67","doi-asserted-by":"crossref","unstructured":"Zhang Y, Pan Y, Yao T, Huang R, Mei T, Chen CW (2022) Exploring structure-aware transformer over interaction proposals for human-object interaction detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 19548\u201319557","DOI":"10.1109\/CVPR52688.2022.01894"},{"key":"6279_CR68","doi-asserted-by":"crossref","unstructured":"Iftekhar A, Chen H, Kundu K, Li X, Tighe J, Modolo D (2022) What to look at and where: Semantic and spatial refined transformer for detecting human-object interactions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5353\u20135363","DOI":"10.1109\/CVPR52688.2022.00528"},{"key":"6279_CR69","doi-asserted-by":"crossref","unstructured":"Tu D, Sun W, Zhai G, Shen W (2023) Agglomerative transformer for human-object interaction detection. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 21614\u201321624","DOI":"10.1109\/ICCV51070.2023.01976"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-025-06279-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-025-06279-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-025-06279-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T03:41:18Z","timestamp":1757130078000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-025-06279-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,22]]},"references-count":69,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["6279"],"URL":"https:\/\/doi.org\/10.1007\/s10489-025-06279-7","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,22]]},"assertion":[{"value":"9 January 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 January 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The corresponding author of this paper is the associate editor of Applied Intelligence.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"Informed consent was obtained from the Shanghai University of Engineering Science for the publication of this article and from all authors.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and Informed Consent for Data Used"}}],"article-number":"363"}}