{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:10Z","timestamp":1750309510684,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Postdoctoral Fellowship Program of CPSF","award":["GZB20240115"],"award-info":[{"award-number":["GZB20240115"]}]},{"name":"National Natural Science Foundation of China","award":["62306064"],"award-info":[{"award-number":["62306064"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680680","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"1495-1504","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Open-vocabulary HOI Detection with Calibrated Vision-language Models and Locality-aware Queries"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-9185-3193","authenticated-orcid":false,"given":"Zhenhao","family":"Yang","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2802-0556","authenticated-orcid":false,"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2259-886X","authenticated-orcid":false,"given":"Deqiang","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Chongqing University, Chongqing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6857-7744","authenticated-orcid":false,"given":"Guiduo","family":"Duan","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4839-0234","authenticated-orcid":false,"given":"Dongyang","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8676-7429","authenticated-orcid":false,"given":"Tao","family":"He","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4651-2821","authenticated-orcid":false,"given":"Yuan-Fang","family":"Li","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Australia"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"volume-title":"Learning to detect human-object interactions. In 2018 ieee winter conference on applications of computer vision (wacv)","author":"Chao Yu-Wei","key":"e_1_3_2_1_2_1","unstructured":"Yu-Wei Chao, Yunfan Liu, Xieyang Liu, Huayi Zeng, and Jia Deng. 2018. Learning to detect human-object interactions. In 2018 ieee winter conference on applications of computer vision (wacv). IEEE, 381--389."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.23919\/MVA57639.2023.10215534"},{"key":"e_1_3_2_1_4_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Christoph Feichtenhofer Axel Pinz and Richard P Wildes. 2017. Spatiotemporal multiplier networks for video action recognition. In CVPR. 4768--4777.","DOI":"10.1109\/CVPR.2017.787"},{"key":"e_1_3_2_1_6_1","volume-title":"Clip-adapter: Better vision-language models with feature adapters. ICCV","author":"Gao Peng","year":"2023","unstructured":"Peng Gao, Shijie Geng, Renrui Zhang, Teli Ma, Rongyao Fang, Yongfeng Zhang, Hongsheng Li, and Yu Qiao. 2023. Clip-adapter: Better vision-language models with feature adapters. ICCV (2023), 1--15."},{"key":"e_1_3_2_1_7_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_8_1","volume-title":"Visual semantic role labeling. arXiv preprint arXiv:1505.04474","author":"Gupta Saurabh","year":"2015","unstructured":"Saurabh Gupta and Jitendra Malik. 2015. Visual semantic role labeling. arXiv preprint arXiv:1505.04474 (2015)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01568"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_4"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3330304"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5832"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_35"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00056"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01441"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00528"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01887"},{"key":"e_1_3_2_1_19_1","volume-title":"The overlooked classifier in human-object interaction recognition. arXiv preprint arXiv:2203.05676","author":"Jin Ying","year":"2022","unstructured":"Ying Jin, Yinpeng Chen, Lijuan Wang, Jianfeng Wang, Pei Yu, Lin Liang, Jenq-Neng Hwang, and Zicheng Liu. 2022. The overlooked classifier in human-object interaction recognition. arXiv preprint arXiv:2203.05676 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01897"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 6480--6490","author":"Lei Ting","year":"2023","unstructured":"Ting Lei, Fabian Caba, Qingchao Chen, Hailin Jin, Yuxin Peng, and Yang Liu. 2023. Efficient Adaptive Human-Object Interaction Detection with Conceptguided Memory. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 6480--6490."},{"key":"e_1_3_2_1_22_1","volume-title":"Neural-Logic Human-Object Interaction Detection. arXiv preprint arXiv:2311.09817","author":"Li Liulei","year":"2023","unstructured":"Liulei Li, Jianan Wei, Wenguan Wang, and Yi Yang. 2023. Neural-Logic Human-Object Interaction Detection. arXiv preprint arXiv:2311.09817 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01949"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_25_1","volume-title":"Dab-detr: Dynamic anchor boxes are better queries for detr. arXiv preprint arXiv:2201.12329","author":"Liu Shilong","year":"2022","unstructured":"Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, and Lei Zhang. 2022. Dab-detr: Dynamic anchor boxes are better queries for detr. arXiv preprint arXiv:2201.12329 (2022)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413600"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_28_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems.","author":"Mao Yunyao","year":"2023","unstructured":"Yunyao Mao, Jiajun Deng, Wengang Zhou, Li Li, Yao Fang, and Houqiang Li. 2023. CLIP4HOI: Towards Adap ting CLIP for Practical Zero-Shot HOI Detection. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02251"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-srw.11"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01645"},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-71249-9_14"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01027"},{"key":"e_1_3_2_1_35_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00101"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00558"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25385"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28422"},{"key":"e_1_3_2_1_40_1","volume-title":"Boosting human-object interaction detection with text-to-image diffusion model. arXiv preprint arXiv:2305.12252","author":"Yang Jie","year":"2023","unstructured":"Jie Yang, Bingliang Li, Fengyu Yang, Ailing Zeng, Lei Zhang, and Ruimao Zhang. 2023. Boosting human-object interaction detection with text-to-image diffusion model. arXiv preprint arXiv:2305.12252 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Hangjie Yuan Shiwei Zhang Xiang Wang Samuel Albanie Yining Pan Tao Feng Jianwen Jiang Dong Ni Yingya Zhang and Deli Zhao. 2023. RLIPv2: Fast Scaling of Relational Language-Image Pre-training. In ICCV. 21649--21661.","DOI":"10.1109\/ICCV51070.2023.01979"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01947"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00955"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01858"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_47_1","volume-title":"Zero-Shot Human-Object Interaction Detection via Similarity Propagation","author":"Zong Daoming","year":"2023","unstructured":"Daoming Zong and Shiliang Sun. 2023. Zero-Shot Human-Object Interaction Detection via Similarity Propagation. IEEE Transactions on Neural Networks and Learning Systems (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680680","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680680","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:57Z","timestamp":1750295877000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680680"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":47,"alternative-id":["10.1145\/3664647.3680680","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680680","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}