{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T13:55:24Z","timestamp":1776347724913,"version":"3.51.2"},"reference-count":40,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100017610","name":"Shenzhen Science and Technology Innovation Program","doi-asserted-by":"publisher","award":["ZDCY20250901100201002"],"award-info":[{"award-number":["ZDCY20250901100201002"]}],"id":[{"id":"10.13039\/501100017610","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2025A1515011954"],"award-info":[{"award-number":["2025A1515011954"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.patcog.2026.113557","type":"journal-article","created":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T08:02:57Z","timestamp":1774512177000},"page":"113557","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PB","title":["M3-HOI: Multi-modal mining network for video-based human-object interaction recognition"],"prefix":"10.1016","volume":"179","author":[{"given":"Bohong","family":"Wu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5395-6175","authenticated-orcid":false,"given":"Qing","family":"Gao","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113557_bib0001","unstructured":"E. V. Mascaro, D. Sliwowski, D. Lee, Hoi4abot: Human-object interaction anticipation for human intention reading collaborative robots, (2023). arXiv: 2309.16524\">arxiv preprint arXiv: 2309.16524."},{"key":"10.1016\/j.patcog.2026.113557_bib0002","series-title":"7th Conference on Robot Learning, CoRL 2023","first-page":"1111","article-title":"HOI4ABOT: human-Object interaction anticipation for human intention reading assistive robots","author":"Valls Mascar\u00f3","year":"2023"},{"key":"10.1016\/j.patcog.2026.113557_bib0003","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111452","article-title":"Knowledge-driven compositional action recognition","volume":"163","author":"Liu","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113557_bib0004","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110897","article-title":"Human\u2013object interaction detection via recycling of ground-truth annotations","volume":"157","author":"Lin","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113557_bib0005","series-title":"European Conference on Computer Vision","first-page":"474","article-title":"Geometric features informed multi-person human-object interaction recognition in videos","author":"Qiao","year":"2022"},{"issue":"1","key":"10.1016\/j.patcog.2026.113557_bib0006","doi-asserted-by":"crossref","first-page":"13","DOI":"10.1007\/s00530-024-01604-5","article-title":"HierGAT: hierarchical spatial-temporal network with graph and transformer for video HOI detection","volume":"31","author":"Wu","year":"2025","journal-title":"Multim. Syst."},{"key":"10.1016\/j.patcog.2026.113557_bib0007","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"3079","article-title":"Open set video hoi detection from action-centric chain-of-look prompting","author":"Xi","year":"2023"},{"issue":"5","key":"10.1016\/j.patcog.2026.113557_bib0008","doi-asserted-by":"crossref","first-page":"3728","DOI":"10.1109\/TCSVT.2023.3317877","article-title":"Graph regularized and feature aware matrix factorization for robust incomplete multi-view clustering","volume":"34","author":"Wen","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"8","key":"10.1016\/j.patcog.2026.113557_bib0009","doi-asserted-by":"crossref","first-page":"11396","DOI":"10.1109\/TNNLS.2023.3260349","article-title":"Deep double incomplete multi-view multi-label learning with incomplete labels and missing views","volume":"35","author":"Wen","year":"2023","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.patcog.2026.113557_bib0010","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"8359","article-title":"Detecting and recognizing human-object interactions","author":"Gkioxari","year":"2018"},{"key":"10.1016\/j.patcog.2026.113557_bib0011","article-title":"Faster r-cnn: towards real-time object detection with region proposal networks","volume":"28","author":"Ren","year":"2015","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113557_bib0012","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"401","article-title":"Learning human-object interactions by graph parsing neural networks","author":"Qi","year":"2018"},{"key":"10.1016\/j.patcog.2026.113557_bib0013","series-title":"European Conference on Computer Vision","first-page":"248","article-title":"Contextual heterogeneous graph network for human-object interaction detection","author":"Wang","year":"2020"},{"key":"10.1016\/j.patcog.2026.113557_bib0014","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13617","article-title":"Vsgnet: spatial attention network for detecting human object interactions using graph convolutions","author":"Ulutan","year":"2020"},{"key":"10.1016\/j.patcog.2026.113557_bib0015","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"15984","article-title":"Exploiting scene graphs for human-object interaction detection","author":"He","year":"2021"},{"key":"10.1016\/j.patcog.2026.113557_bib0016","series-title":"2018 IEEE Winter Conference on Applications of Computer Vision (Wacv)","first-page":"381","article-title":"Learning to detect human-object interactions","author":"Chao","year":"2018"},{"key":"10.1016\/j.patcog.2026.113557_bib0017","unstructured":"C. Gao, Y. Zou, J.-B. Huang, ican: Instance-centric attention network for human-object interaction detection, (2018).arXiv: 1808.10437\">arxiv preprint arXiv: 1808.10437."},{"issue":"10","key":"10.1016\/j.patcog.2026.113557_bib0018","doi-asserted-by":"crossref","first-page":"6826","DOI":"10.1109\/TPAMI.2024.3386891","article-title":"PPDM++: Parallel point detection and matching for fast and accurate HOI detection","volume":"46","author":"Liao","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113557_bib0019","series-title":"Chinese Conference on Pattern Recognition and Computer Vision (PRCV)","first-page":"481","article-title":"MIT: Multi-cue injected transformer for two-Stage HOI detection","author":"Peng","year":"2024"},{"key":"10.1016\/j.patcog.2026.113557_bib0020","series-title":"Proceedings of the 2021 ACM Workshop on Intelligent Cross-Data Analysis and Retrieval","first-page":"9","article-title":"St-hoi: a spatial-temporal baseline for human-object interaction detection in videos","author":"Chiou","year":"2021"},{"key":"10.1016\/j.patcog.2026.113557_bib0021","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5308","article-title":"Structural-rnn: deep learning on spatio-temporal graphs","author":"Jain","year":"2016"},{"key":"10.1016\/j.patcog.2026.113557_bib0022","series-title":"Proceedings of the 29th ACM International Conference on Multimedia","first-page":"4985","article-title":"Spatio-temporal interaction graph parsing networks for human-object interaction recognition","author":"Wang","year":"2021"},{"key":"10.1016\/j.patcog.2026.113557_bib0023","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"16041","article-title":"Learning asynchronous and sparse human-object interaction in videos","author":"Morais","year":"2021"},{"key":"10.1016\/j.patcog.2026.113557_bib0024","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110050","article-title":"Sharing-net: lightweight feedforward network for skeleton-based action recognition based on information sharing mechanism","volume":"146","author":"Zhao","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113557_bib0025","article-title":"Partial multiview incomplete multilabel learning via uncertainty-driven reliable dynamic fusion","author":"Wen","year":"2025","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113557_bib0026","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113557_bib0027","series-title":"International Conference on Pattern Recognition","first-page":"262","article-title":"From category to scenery: an end-to-end framework for multi-person human-object interaction recognition in videos","author":"Qiao","year":"2024"},{"key":"10.1016\/j.patcog.2026.113557_bib0028","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"482","article-title":"Ppdm: parallel point detection and matching for real-time human-object interaction detection","author":"Liao","year":"2020"},{"key":"10.1016\/j.patcog.2026.113557_bib0029","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"8472","article-title":"Interaction compass: multi-label zero-shot learning of human-object interactions via spatial relations","author":"Huynh","year":"2021"},{"key":"10.1016\/j.patcog.2026.113557_bib0030","doi-asserted-by":"crossref","first-page":"133","DOI":"10.1016\/j.patrec.2025.02.014","article-title":"VHOIP: Video-based human\u2013Object interaction recognition with CLIP prior knowledge","volume":"190","author":"Baek","year":"2025","journal-title":"Pattern Recognit. Lett."},{"key":"10.1016\/j.patcog.2026.113557_bib0031","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.113557_bib0032","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"16954","article-title":"Open-World human-Object interaction detection via multi-modal prompts","author":"Yang","year":"2024"},{"issue":"2","key":"10.1016\/j.patcog.2026.113557_bib0033","doi-asserted-by":"crossref","first-page":"1169","DOI":"10.1007\/s10586-023-04004-y","article-title":"Human\u2013object interaction recognition based on interactivity detection and multi-feature fusion","volume":"27","author":"Xia","year":"2024","journal-title":"Cluster Comput."},{"key":"10.1016\/j.patcog.2026.113557_bib0034","doi-asserted-by":"crossref","unstructured":"T. Qiao, R. Li, F.W.B. Li, Y. Kubotani, S. Morishima, H.P.H. Shum, Geometric visual fusion graph neural networks for multi-person human-object interaction recognition in videos, (2025). arXiv: 2506.03440\">arxiv preprint arXiv: 2506.03440.","DOI":"10.1016\/j.eswa.2025.128344"},{"key":"10.1016\/j.patcog.2026.113557_bib0035","article-title":"Transflow-HOI: transformer-Driven multimodal fusion for high-Performance human-Object interaction recognition","author":"Sajid","year":"2026","journal-title":"IEEE Access"},{"key":"10.1016\/j.patcog.2026.113557_bib0036","doi-asserted-by":"crossref","DOI":"10.1016\/j.ins.2023.119921","article-title":"A novel distance measure based on dynamic time warping to improve time series classification","volume":"656","author":"Liu","year":"2024","journal-title":"Inf. Sci."},{"issue":"1","key":"10.1016\/j.patcog.2026.113557_bib0037","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1109\/LRA.2019.2949221","article-title":"Learning object-action relations from bimanual human demonstration using graph networks","volume":"5","author":"Dreher","year":"2019","journal-title":"IEEE Rob. Autom. Lett."},{"issue":"8","key":"10.1016\/j.patcog.2026.113557_bib0038","doi-asserted-by":"crossref","first-page":"951","DOI":"10.1177\/0278364913478446","article-title":"Learning human activities and object affordances from rgb-d videos","volume":"32","author":"Koppula","year":"2013","journal-title":"Int. J. Rob. Res."},{"key":"10.1016\/j.patcog.2026.113557_bib0039","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"156","article-title":"Temporal convolutional networks for action segmentation and detection","author":"Lea","year":"2017"},{"key":"10.1016\/j.patcog.2026.113557_bib0040","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3575","article-title":"Ms-tcn: multi-stage temporal convolutional network for action segmentation","author":"Farha","year":"2019"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005236?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005236?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T13:05:17Z","timestamp":1776344717000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326005236"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":40,"alternative-id":["S0031320326005236"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113557","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"M3-HOI: Multi-modal mining network for video-based human-object interaction recognition","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113557","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113557"}}