{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,30]],"date-time":"2026-05-30T07:01:28Z","timestamp":1780124488402,"version":"3.54.0"},"reference-count":35,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.patcog.2026.113637","type":"journal-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T02:57:26Z","timestamp":1775098646000},"page":"113637","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PB","title":["H2R-BM: Can leveraging human videos enhance performance and generalizability in robotic bimanual manipulation?"],"prefix":"10.1016","volume":"179","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4209-6695","authenticated-orcid":false,"given":"Xiaoshuai","family":"Hao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huaihai","family":"Lyu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lingfeng","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rui","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dayan","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jing","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Long","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"5","key":"10.1016\/j.patcog.2026.113637_bib0001","doi-asserted-by":"crossref","first-page":"3929","DOI":"10.1109\/TRO.2023.3281153","article-title":"AnyGrasp: robust and efficient grasp perception in spatial and temporal domains","volume":"39","author":"Fang","year":"2023","journal-title":"IEEE Trans. Rob."},{"issue":"6","key":"10.1016\/j.patcog.2026.113637_bib0002","doi-asserted-by":"crossref","first-page":"6012","DOI":"10.1109\/LRA.2024.3396101","article-title":"RGBGrasp: image-based object grasping by capturing multiple views during robot arm movement with neural radiance fields","volume":"9","author":"Liu","year":"2024","journal-title":"IEEE Rob. Autom. Lett."},{"key":"10.1016\/j.patcog.2026.113637_bib0003","doi-asserted-by":"crossref","unstructured":"D. Li, Y. Jin, Y. Sun, H. Yu, J. Shi, X. Hao, P. Hao, H. Liu, F. Sun, J. Zhang, et al., What foundation models can bring for robot learning in manipulation: a survey, (2024). arXiv: 2404.18201.","DOI":"10.1177\/02783649251390579"},{"key":"10.1016\/j.patcog.2026.113637_bib0004","series-title":"International Conference on Computer Vision","article-title":"AnyBimanual: transferring unimanual policy for general bimanual manipulation","author":"Lu","year":"2025"},{"key":"10.1016\/j.patcog.2026.113637_bib0005","series-title":"1st Workshop on X-Embodiment Robot Learning","article-title":"EgoMimic: scaling imitation learning via egocentric video","author":"Kareer","year":"2024"},{"key":"10.1016\/j.patcog.2026.113637_bib0006","doi-asserted-by":"crossref","unstructured":"T.Z. Zhao, V. Kumar, S. Levine, C. Finn, Learning fine-grained bimanual manipulation with low-cost hardware, (2023). arXiv: 2304.13705.","DOI":"10.15607\/RSS.2023.XIX.016"},{"key":"10.1016\/j.patcog.2026.113637_bib0007","series-title":"Robotics: Science and Systems XIX, Daegu, Republic of Korea, July 10\u201314, 2023","article-title":"Diffusion policy: visuomotor policy learning via action diffusion","author":"Chi","year":"2023"},{"key":"10.1016\/j.patcog.2026.113637_bib0008","series-title":"The Thirteenth International Conference on Learning Representations, ICLR 2025, Singapore, April 24\u201328, 2025","article-title":"RDT-1B: a diffusion foundation model for bimanual manipulation","author":"Liu","year":"2025"},{"key":"10.1016\/j.patcog.2026.113637_bib0009","series-title":"CoRL 2024 Workshop on Whole-body Control and Bimanual Manipulation: Applications in Humanoids and Beyond","article-title":"PerAct2: benchmarking and learning for robotic bimanual manipulation tasks","author":"Grotz","year":"2024"},{"key":"10.1016\/j.patcog.2026.113637_bib0010","series-title":"2024\u202fIEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"12156","article-title":"GELLO: A general, low-cost, and intuitive teleoperation framework for robot manipulators","author":"Wu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113637_bib0011","unstructured":"Zhaxizhuoma, K. Liu, C. Guan, Z. Jia, Z. Wu, X. Liu, T. Wang, S. Liang, P. Chen, P. Zhang, H. Song, D. Qu, D. Wang, Z. Wang, N. Cao, Y. Ding, B. Zhao, X. Li, FastUMI: a scalable and hardware-independent universal manipulation interface with dataset, (2025). arXiv: 2409.19499."},{"key":"10.1016\/j.patcog.2026.113637_bib0012","doi-asserted-by":"crossref","unstructured":"C. Chi, Z. Xu, C. Pan, E. Cousineau, B. Burchfiel, S. Feng, R. Tedrake, S. Song, Universal manipulation interface: in-the-wild robot teaching without in-the-wild robots, arXiv: 2402.10329(2024).","DOI":"10.15607\/RSS.2024.XX.045"},{"key":"10.1016\/j.patcog.2026.113637_bib0013","unstructured":"X. Cheng, J. Li, S. Yang, G. Yang, X. Wang, Open-television: teleoperation with immersive active visual feedback, (2024). arXiv: 2407.01512."},{"key":"10.1016\/j.patcog.2026.113637_bib0014","unstructured":"C. Wang, L. Fan, J. Sun, R. Zhang, L. Fei-Fei, D. Xu, Y. Zhu, A. Anandkumar, MimicPlay: Long-horizon imitation learning by watching human play, (2023). arXiv: 2302.12422."},{"key":"10.1016\/j.patcog.2026.113637_bib0015","article-title":"VLM see, robot do: human demo video to robot action plan via vision language model","author":"Wang","year":"2025","journal-title":"2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems"},{"key":"10.1016\/j.patcog.2026.113637_bib0016","unstructured":"J. Clark, S. Mirchandani, D. Sadigh, S. Belkhale, Action-free reasoning for policy generalization, (2025). arXiv: 2502.03729."},{"key":"10.1016\/j.patcog.2026.113637_bib0017","doi-asserted-by":"crossref","unstructured":"S. Belkhale, T. Ding, T. Xiao, P. Sermanet, Q. Vuong, J. Tompson, Y. Chebotar, D. Dwibedi, D. Sadigh, RT-H: action hierarchies using language, (2024). arXiv: 2403.01823.","DOI":"10.15607\/RSS.2024.XX.049"},{"key":"10.1016\/j.patcog.2026.113637_bib0018","doi-asserted-by":"crossref","unstructured":"Y. Tang, S. Zhang, X. Hao, P. Wang, J. Wu, Z. Wang, S. Zhang, AffordGrasp: in-context affordance reasoning for open-vocabulary task-oriented grasping in clutter, (2025). arXiv: 2503.00778.","DOI":"10.1109\/IROS60139.2025.11245995"},{"key":"10.1016\/j.patcog.2026.113637_bib0019","series-title":"The Eleventh International Conference on Learning Representations","article-title":"DualAfford: learning collaborative visual affordance for dual-gripper manipulation","author":"Zhao","year":"2023"},{"key":"10.1016\/j.patcog.2026.113637_bib0020","series-title":"Conference on Robot Learning","article-title":"VoxAct-B: voxel-based acting and stabilizing policy for bimanual manipulation","author":"Liu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113637_bib0021","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13778","article-title":"Affordances from human videos as a versatile representation for robotics","author":"Bahl","year":"2023"},{"key":"10.1016\/j.patcog.2026.113637_bib0022","doi-asserted-by":"crossref","unstructured":"V. Jain, M. Attarian, N.J. Joshi, A. Wahid, D. Driess, Q. Vuong, P.R. Sanketi, P. Sermanet, S. Welker, C. Chan, et al., Vid2Robot: end-to-end video-conditioned policy learning with cross-attention transformers, (2024). arXiv: 2403.12943.","DOI":"10.15607\/RSS.2024.XX.052"},{"key":"10.1016\/j.patcog.2026.113637_bib0023","series-title":"IEEE International Conference on Robotics and Automation","article-title":"R+X: retrieval and execution from everyday human videos","author":"Papagiannis","year":"2025"},{"key":"10.1016\/j.patcog.2026.113637_bib0024","series-title":"European Conference on Computer Vision","first-page":"306","article-title":"Track2Act: predicting point tracks from internet videos enables generalizable robot manipulation","author":"Bharadhwaj","year":"2024"},{"key":"10.1016\/j.patcog.2026.113637_bib0025","article-title":"DVLTA-VQA: decoupled vision-language modeling with text-guided adaptation for blind video quality assessment","author":"Yu","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113637_bib0026","series-title":"Proceedings of the 29th ACM International Conference on Multimedia","first-page":"1248","article-title":"Perceptual quality assessment of internet videos","author":"Xu","year":"2021"},{"key":"10.1016\/j.patcog.2026.113637_bib0027","unstructured":"Y. Li, X. Yang, W. Liu, X. Jin, X. Jia, Y. Lai, P.L. Rosin, H. Liu, W. Zhou, Temporal inconsistency guidance for super-resolution video quality assessment, (2024). arXiv: 2412.18933."},{"key":"10.1016\/j.patcog.2026.113637_bib0028","unstructured":"C.-L. Cheang, G. Chen, Y. Jing, T. Kong, H. Li, Y. Li, Y. Liu, H. Wu, J. Xu, Y. Yang, et al., GR-2: a generative video-language-action model with web-scale knowledge for robot manipulation, (2024). arXiv: 2410.06158."},{"key":"10.1016\/j.patcog.2026.113637_bib0029","unstructured":"Octo Model Team, D. Ghosh, H. Walke, K. Pertsch, K. Black, O. Mees, S. Dasari, J. Hejna, C. Xu, J. Luo, T. Kreiman, Y. Tan, L.Y. Chen, P. Sanketi, Q. Vuong, T. Xiao, D. Sadigh, C. Finn, S. Levine, Octo: an open-source generalist robot policy, in: Proceedings of Robotics: Science and Systems, Delft, Netherlands, 2024."},{"key":"10.1016\/j.patcog.2026.113637_bib0030","unstructured":"X. Hao, L. Zhou, et al., Mimo-embodied: X-embodied foundation model technical report, (2025). arXiv: 2511.16518."},{"key":"10.1016\/j.patcog.2026.113637_bib0031","unstructured":"J. Liu, C. Li, G. Wang, L. Lee, K. Zhou, S. Chen, C. Xiong, J. Ge, R. Zhang, S. Zhang, Self-corrected multimodal large language model for end-to-end robot manipulation, (2024). arXiv: 2405.17418."},{"key":"10.1016\/j.patcog.2026.113637_bib0032","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110927","article-title":"Distilling interaction knowledge for semi-supervised egocentric action recognition","volume":"157","author":"Wang","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113637_bib0033","unstructured":"Z. Fu, T.Z. Zhao, C. Finn, Mobile aloha: Learning bimanual mobile manipulation with low-cost whole-body teleoperation, (2024). arXiv: 2401.02117."},{"key":"10.1016\/j.patcog.2026.113637_bib0034","doi-asserted-by":"crossref","unstructured":"K. Black, N. Brown, D. Driess, A. Esmail, M. Equi, C. Finn, N. Fusai, L. Groom, K. Hausman, B. Ichter, S. Jakubczak, T. Jones, L. Ke, S. Levine, A. Li-Bell, M. Mothukuri, S. Nair, K. Pertsch, L.X. Shi, J. Tanner, Q. Vuong, A. Walling, H. Wang, U. Zhilinsky, \u03c00: A vision-language-action flow model for general robot control, (2024). arXiv preprint arXiv: 2410.24164.","DOI":"10.15607\/RSS.2025.XXI.010"},{"key":"10.1016\/j.patcog.2026.113637_bib0035","series-title":"Proceedings of the Conference on Robot Learning","article-title":"OpenVLA: an open-source vision-language-action model","author":"Kim","year":"2024"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326006023?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326006023?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,30]],"date-time":"2026-05-30T06:05:24Z","timestamp":1780121124000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326006023"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":35,"alternative-id":["S0031320326006023"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113637","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"H2R-BM: Can leveraging human videos enhance performance and generalizability in robotic bimanual manipulation?","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113637","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113637"}}