{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:10:10Z","timestamp":1765339810640,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372387, 62001400, 52441801, 61802053"],"award-info":[{"award-number":["62372387, 62001400, 52441801, 61802053"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100018542","name":"Natural Science Foundation of Sichuan Province","doi-asserted-by":"publisher","award":["2024NSFSC0494, 2024NSFSC0508"],"award-info":[{"award-number":["2024NSFSC0494, 2024NSFSC0508"]}],"id":[{"id":"10.13039\/501100018542","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["2682024ZTPY044, 2682025ZD004"],"award-info":[{"award-number":["2682024ZTPY044, 2682025ZD004"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2021M702713"],"award-info":[{"award-number":["2021M702713"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Special Research Funding under Yibin Municipal-University Dual Agreement","award":["YBSCXY2024010012, YBSCXY2024010006"],"award-info":[{"award-number":["YBSCXY2024010012, YBSCXY2024010006"]}]},{"name":"Fund of National Laboratory on Adaptive Optics, China","award":["FNLAO-24-ZD-O02"],"award-info":[{"award-number":["FNLAO-24-ZD-O02"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755281","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"8058-8066","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HOPNet: Learning Hand-Object-Person Interaction Network for Hand Contact State Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8278-1765","authenticated-orcid":false,"given":"Wei","family":"Li","sequence":"first","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China and Engineering Research Center of Sustainable Urban Intelligent Transportation, Ministry of Education, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5883-1129","authenticated-orcid":false,"given":"Yizhao","family":"Wan","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China and Engineering Research Center of Sustainable Urban Intelligent Transportation, Ministry of Education, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8322-8558","authenticated-orcid":false,"given":"Xiao","family":"Wu","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China and Engineering Research Center of Sustainable Urban Intelligent Transportation, Ministry of Education, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9256-0882","authenticated-orcid":false,"given":"Jianshuai","family":"Wang","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3074-4620","authenticated-orcid":false,"given":"Penglin","family":"Dai","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4083-5155","authenticated-orcid":false,"given":"Zhaoquan","family":"Yuan","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7291-7299","author":"Cao Z.","key":"e_1_3_2_2_1_1","unstructured":"Z. Cao, T. Simon, S.-E. Wei, and Y. Sheikh. 2017. Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 7291-7299."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00893"},{"key":"e_1_3_2_2_3_1","unstructured":"T. Cheng D. Shan A. Hassen R. Higgins and D. Fouhey. 2023. Towards a richer 2d understanding of hands at scale. Advances in Neural Information Processing Systems (NeurIPS) (2023) 30453-30465."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 19888-19901","author":"Diller C.","key":"e_1_3_2_2_4_1","unstructured":"C. Diller and A. Dai. 2024. Cg-hoi: Contact-guided 3d human-object interaction generation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 19888-19901."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3069835"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4438-4446","author":"Fu J.","key":"e_1_3_2_2_7_1","unstructured":"J. Fu, H. Zheng, and T. Mei. 2017. Look closer to see better: Recurrent attention convolutional neural network for fine-grained image recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4438-4446."},{"key":"e_1_3_2_2_8_1","first-page":"79741","article-title":"Coohoi: Learning cooperative human-object interaction with manipulated object dynamics","author":"Gao J.","year":"2024","unstructured":"J. Gao, Z. Wang, Z. Xiao, J. Wang, T. Wang, J. Cao, X. Hu, S. Liu, J. Dai, and J. Pang. 2024. Coohoi: Learning cooperative human-object interaction with manipulated object dynamics. In Advances in Neural Information Processing Systems (NeurIPS). 79741-79763.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10833-10842","author":"Ge L.","key":"e_1_3_2_2_9_1","unstructured":"L. Ge, Z. Ren, Y. Li, Z. Xue, Y. Wang, J. Cai, and J. Yuan. 2019. 3d hand shape and pose estimation from a single rgb image. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10833-10842."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 1471-1481","author":"Grady P.","key":"e_1_3_2_2_10_1","unstructured":"P. Grady, C. Tang, C. D. Twigg, M. Vo, S. Brahmbhatt, and C. C. Kemp. 2021. Contactopt: Optimizing contact to improve grasps. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 1471-1481."},{"volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia (ACM MM). 10363-10372","author":"Gu R.","key":"e_1_3_2_2_11_1","unstructured":"R. Gu, J. Zhu, Y. Si, F. Gao, J. Xu, and G. Xu. 2024. 3D Human Pose Estimation from Multiple Dynamic Views via Single-view Pretraining with Procrustes Alignment. In Proceedings of the 32nd ACM International Conference on Multimedia (ACM MM). 10363-10372."},{"volume-title":"Mask R-CNN. In Proceedings of the IEEE International Conference on Computer Vision (ICCV). 2961-2969","author":"He K.","key":"e_1_3_2_2_12_1","unstructured":"K. He, G. Gkioxari, P. Doll\u00e1r, and R. Girshick. 2017. Mask R-CNN. In Proceedings of the IEEE International Conference on Computer Vision (ICCV). 2961-2969."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 770-778","author":"He K.","key":"e_1_3_2_2_13_1","unstructured":"K. He, X. Zhang, S. Ren, and J. Sun. 2016. Deep Residual Learning for Image Recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 770-778."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2024.109526"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10468-10477","author":"Ji R.","key":"e_1_3_2_2_15_1","unstructured":"R. Ji, L. Wen, L. Zhang, D. Du, Y. Wu, C. Zhao, X. Liu, and F. Huang. 2020. Attention convolutional binary neural tree for fine-grained visual categorization. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 10468-10477."},{"volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia (ACM MM). 4640-4649","author":"Jia C.","key":"e_1_3_2_2_16_1","unstructured":"C. Jia, M. Luo, X. Chang, Z. Dang, M. Han, M. Wang, G. Dai, S. Dang, and J. Wang. 2024. Generating Action-conditioned Prompts for Open-vocabulary Video Action Recognition. In Proceedings of the 32nd ACM International Conference on Multimedia (ACM MM). 4640-4649."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs12142209"},{"volume-title":"Proceedings of the IEEE International Conference on Robotics and Automation (ICRA). 714-720","author":"Li Y.","key":"e_1_3_2_2_18_1","unstructured":"Y. Li, W. Wei, D. Li, P. Wang, W. Li, and J. Zhong. 2022. HGC-Net: Deep anthropomorphic hand grasping in clutter. In Proceedings of the IEEE International Conference on Robotics and Automation (ICRA). 714-720."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"crossref","unstructured":"T.-Y. Lin P. Doll\u00e1r R. Girshick K. He B. Hariharan and S. Belongie. 2017. Feature Pyramid Networks for Object Detection. arXiv:1612.03144 [cs.CV]","DOI":"10.1109\/CVPR.2017.106"},{"volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). Springer, 740-755","author":"Lin T.-Y.","key":"e_1_3_2_2_20_1","unstructured":"T.-Y. Lin, M. Maire, S. Belongie, J. Hays, P. Perona, D. Ramanan, P. Doll\u00e1r, and C. L. Zitnick. 2014. Microsoft COCO: Common Objects in Context. In Proceedings of the European Conference on Computer Vision (ECCV). Springer, 740-755."},{"volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 20609-20620","author":"Liu S.","key":"e_1_3_2_2_21_1","unstructured":"S. Liu, Y. Zhou, J. Yang, S. Gupta, and S. Wang. 2023. Contactgen: Generative contact modeling for grasp generation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 20609-20620."},{"key":"e_1_3_2_2_22_1","volume-title":"Parsenet: Looking wider to see better. arXiv:1506.04579 [cs.CV]","author":"Liu W.","year":"2015","unstructured":"W. Liu, A. Rabinovich, and A. C. Berg. 2015. Parsenet: Looking wider to see better. arXiv:1506.04579 [cs.CV]"},{"key":"e_1_3_2_2_23_1","first-page":"7841","article-title":"Detecting hands and recognizing physical contact in the wild","author":"Narasimhaswamy S.","year":"2020","unstructured":"S. Narasimhaswamy, T. Nguyen, and M. H. Nguyen. 2020. Detecting hands and recognizing physical contact in the wild. In Advances in Neural Information Processing Systems (NeurIPS). 7841-7851.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)."},{"volume-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV). 2641-2649","author":"Plummer B. A.","key":"e_1_3_2_2_24_1","unstructured":"B. A. Plummer, L. Wang, C. M. Cervantes, J. C. Caicedo, J. Hockenmaier, and S. Lazebnik. 2015. Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. In Proceedings of the IEEE International Conference on Computer Vision (ICCV). 2641-2649."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 9869-9878","author":"Shan D.","key":"e_1_3_2_2_25_1","unstructured":"D. Shan, J. Geng, M. Shu, and D. F. Fouhey. 2020. Understanding human hands in contact at internet scale. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 9869-9878."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 1145-1153","author":"Simon T.","key":"e_1_3_2_2_26_1","unstructured":"T. Simon, H. Joo, I. Matthews, and Y. Sheikh. 2017. Hand keypoint detection in single images using multiview bootstrapping. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 1145-1153."},{"volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 805-821","author":"Sun M.","key":"e_1_3_2_2_27_1","unstructured":"M. Sun, Y. Yuan, F. Zhou, and E. Ding. 2018. Multi-attention multi-class constraint for fine-grained image recognition. In Proceedings of the European Conference on Computer Vision (ECCV). 805-821."},{"volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). Springer, 568-584","author":"Tse T. H. E.","key":"e_1_3_2_2_28_1","unstructured":"T. H. E. Tse, Z. Zhang, K. I. Kim, A. Leonardis, F. Zheng, and H. J. Chang. 2022. S 2 contact: Graph-based network for 3d hand-object contact estimation with semi-supervised learning. In Proceedings of the European Conference on Computer Vision (ECCV). Springer, 568-584."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 777-787","author":"Wang J.","key":"e_1_3_2_2_29_1","unstructured":"J. Wang, Z. Cao, D. Luvizon, L. Liu, K. Sarkar, D. Tang, T. Beeler, and C. Theobalt. 2024. Egocentric whole-body motion capture with fisheyevit and diffusion-based motion refinement. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 777-787."},{"volume-title":"Proceedings of the ACM International Conference on Multimedia (ACM MM). 5477-5485","author":"Wang J.","key":"e_1_3_2_2_30_1","unstructured":"J. Wang, Z. Yu, Z. Tong, H. Wang, J. Liu, W. Zhang, and X. Wu. 2022. Ocr-pose: Occlusion-aware contrastive representation for unsupervised 3d human pose estimation. In Proceedings of the ACM International Conference on Multimedia (ACM MM). 5477-5485."},{"volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 3-19","author":"Woo S.","key":"e_1_3_2_2_31_1","unstructured":"S. Woo, J. Park, J.-Y. Lee, and I. S. Kweon. 2018. CBAM: Convolutional Block Attention Module. In Proceedings of the European Conference on Computer Vision (ECCV). 3-19."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3077512"},{"volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia (ACM MM). 4660-4669","author":"Wu W.","key":"e_1_3_2_2_33_1","unstructured":"W. Wu, C. Zheng, Z. Yang, C. Chen, S. Das, and A. Lu. 2024. Frequency Guidance Matters: Skeletal Action Recognition by Frequency-Aware Mixed Transformer. In Proceedings of the 32nd ACM International Conference on Multimedia (ACM MM). 4660-4669."},{"key":"e_1_3_2_2_34_1","volume-title":"Maclr: Motion-aware contrastive learning of representations for videos. In Proceedings of the European Conference on Computer Vision (ECCV)","author":"Xiao F.","year":"2022","unstructured":"F. Xiao, J. Tighe, and D. Modolo. 2022. Maclr: Motion-aware contrastive learning of representations for videos. In Proceedings of the European Conference on Computer Vision (ECCV). Springer, 353-370."},{"key":"e_1_3_2_2_35_1","first-page":"77132","article-title":"Hoi-swap: Swapping objects in videos with hand-object interaction awareness","volume":"37","author":"Xue Z. S.","year":"2024","unstructured":"Z. S. Xue, R. Luo, C. Chen, and K. Grauman. 2024. Hoi-swap: Swapping objects in videos with hand-object interaction awareness. Advances in Neural Information Processing Systems (NeurIPS), Vol. 37 (2024), 77132-77164.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"T. Yagi M. T. Hasan and Y. Sato. 2021. Hand-object contact prediction via motion-based pseudo-labeling and guided progressive label correction. arXiv preprint arXiv:2110.10174 (2021).","DOI":"10.5244\/C.35.25"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 16284-16295","author":"Yang Y.","key":"e_1_3_2_2_37_1","unstructured":"Y. Yang, W. Zhai, H. Luo, Y. Cao, and Z. J. Zha. 2024. Lemon: Learning 3d human-object interaction relation from 2d images. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 16284-16295."},{"volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia (ACM MM). 8613-8622","author":"Zhang L.","key":"e_1_3_2_2_38_1","unstructured":"L. Zhang, W. Suo, P. Wang, and Y. Zhang. 2024. A Plug-and-Play Method for Rare Human-Object Interactions Detection by Bridging Domain Gap. In Proceedings of the 32nd ACM International Conference on Multimedia (ACM MM). 8613-8622."},{"volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). Springer, 834-849","author":"Zhang N.","key":"e_1_3_2_2_39_1","unstructured":"N. Zhang, J. Donahue, R. Girshick, and T. Darrell. 2014. Part-based R-CNNs for fine-grained category detection. In Proceedings of the European Conference on Computer Vision (ECCV). Springer, 834-849."},{"volume-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV). 5209-5217","author":"Zheng H.","key":"e_1_3_2_2_40_1","unstructured":"H. Zheng, J. Fu, T. Mei, and J. Luo. 2017. Learning multi-attention convolutional neural network for fine-grained image recognition. In Proceedings of the IEEE International Conference on Computer Vision (ICCV). 5209-5217."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 20634-20643","author":"Zhou K.","key":"e_1_3_2_2_41_1","unstructured":"K. Zhou, B. L. Bhatnagar, J. E. Lenssen, and G. Pons-Moll. 2024. Gears: Local geometry-aware hand-object interaction synthesis. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 20634-20643."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3272571"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755281","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:40Z","timestamp":1765339660000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755281"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":42,"alternative-id":["10.1145\/3746027.3755281","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755281","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}