{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T10:37:14Z","timestamp":1778150234632,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100003453","name":"Natural Science Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2023A1515030264"],"award-info":[{"award-number":["2023A1515030264"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Basic Research Fund","award":["JCYJ20200109142217397"],"award-info":[{"award-number":["JCYJ20200109142217397"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612581","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"4368-4377","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["Learning from Easy to Hard Pairs: Multi-step Reasoning Network for Human-Object Interaction Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2470-0293","authenticated-orcid":false,"given":"Yuchen","family":"Zhou","sequence":"first","affiliation":[{"name":"School of Intelligent Systems Engineering, Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0658-8867","authenticated-orcid":false,"given":"Guang","family":"Tan","sequence":"additional","affiliation":[{"name":"School of Intelligent Systems Engineering, Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3594-4808","authenticated-orcid":false,"given":"Mengtang","family":"Li","sequence":"additional","affiliation":[{"name":"School of Intelligent Systems Engineering, Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4128-886X","authenticated-orcid":false,"given":"Chao","family":"Gou","sequence":"additional","affiliation":[{"name":"School of Intelligent Systems Engineering, Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_2_1","volume-title":"Learning to detect human-object interactions. In 2018 ieee winter conference on applications of computer vision (wacv)","author":"Chao Yu-Wei","unstructured":"Yu-Wei Chao, Yunfan Liu, Xieyang Liu, Huayi Zeng, and Jia Deng. 2018. Learning to detect human-object interactions. In 2018 ieee winter conference on applications of computer vision (wacv). IEEE, 381--389."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00889"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01893"},{"key":"e_1_3_2_1_5_1","volume-title":"Drg: Dual relation graph for human-object interaction detection. In Computer Vision-ECCV 2020: 16th European Conference","author":"Gao Chen","year":"2020","unstructured":"Chen Gao, Jiarui Xu, Yuliang Zou, and Jia-Bin Huang. 2020. Drg: Dual relation graph for human-object interaction detection. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XII 16. Springer, 696--712."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00872"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547870"},{"key":"e_1_3_2_1_8_1","volume-title":"Visual semantic role labeling. arXiv preprint arXiv:1505.04474","author":"Gupta Saurabh","year":"2015","unstructured":"Saurabh Gupta and Jitendra Malik. 2015. Visual semantic role labeling. arXiv preprint arXiv:1505.04474 (2015)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 9677--9685","author":"Gupta Tanmay","year":"2019","unstructured":"Tanmay Gupta, Alexander Schwing, and Derek Hoiem. 2019. No-frills humanobject interaction detection: Factorization, layout encodings, and training techniques. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 9677--9685."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_35"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00056"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01441"},{"key":"e_1_3_2_1_13_1","volume-title":"Uniondet: Union-level detector towards real-time human-object interaction detection. In Computer Vision-ECCV 2020: 16th European Conference","author":"Kim Bumsoo","year":"2020","unstructured":"Bumsoo Kim, Taeho Choi, Jaewoo Kang, and Hyunwoo J Kim. 2020. Uniondet: Union-level detector towards real-time human-object interaction detection. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XV 16. Springer, 498--514."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00014"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01897"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00286"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01018"},{"key":"e_1_3_2_1_18_1","first-page":"5011","article-title":"Hoi analysis: Integrating and decomposing human-object interaction","volume":"33","author":"Li Yong-Lu","year":"2020","unstructured":"Yong-Lu Li, Xinpeng Liu, Xiaoqian Wu, Yizhuo Li, and Cewu Lu. 2020. Hoi analysis: Integrating and decomposing human-object interaction. Advances in Neural Information Processing Systems 33 (2020), 5011--5022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00370"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00056"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01949"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01948"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413600"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_25"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01895"},{"key":"e_1_3_2_1_28_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475607"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01027"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01363"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547793"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_15"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475636"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547974"},{"key":"e_1_3_2_1_36_1","volume-title":"Tel Aviv","author":"Wu Xiaoqian","year":"2022","unstructured":"Xiaoqian Wu, Yong-Lu Li, Xinpeng Liu, Junyi Zhang, Yuzhe Wu, and Cewu Lu. 2022. Mining cross-person cues for body-part interactiveness learning in hoi detection. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part IV. Springer, 121--136."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548409"},{"key":"e_1_3_2_1_38_1","first-page":"17209","article-title":"Mining the benefits of two-stage and one-stage hoi detection","volume":"34","author":"Zhang Aixi","year":"2021","unstructured":"Aixi Zhang, Yue Liao, Si Liu, Miao Lu, Yongliang Wang, Chen Gao, and Xiaobo Li. 2021. Mining the benefits of two-stage and one-stage hoi detection. Advances in Neural Information Processing Systems 34 (2021), 17209--17220.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01307"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01947"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01894"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01246"},{"key":"e_1_3_2_1_43_1","volume-title":"Tel Aviv","author":"Zhong Xubin","year":"2022","unstructured":"Xubin Zhong, Changxing Ding, Zijian Li, and Shaoli Huang. 2022. Towards Hard-Positive Query Mining for DETR-Based Human-Object Interaction Detection. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXVII. Springer, 444--460."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_5"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01303"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01896"},{"key":"e_1_3_2_1_47_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. In International Conference on Learning Representations.","author":"Zhu Xizhou","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. [n. d.]. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01165"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612581","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612581","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:27Z","timestamp":1755820887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612581"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":48,"alternative-id":["10.1145\/3581783.3612581","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612581","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}