{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:46:59Z","timestamp":1755802019152,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","funder":[{"name":"Fund of National Laboratory on Adaptive Optics, China","award":["FNLAO-24-ZD-O02"],"award-info":[{"award-number":["FNLAO-24-ZD-O02"]}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61802053,62372387"],"award-info":[{"award-number":["61802053,62372387"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"Natural Science Foundation of Sichuan Province","doi-asserted-by":"publisher","award":["2024NSFSC0508"],"award-info":[{"award-number":["2024NSFSC0508"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733360","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"1786-1794","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HOOI Detection: Cascade-Clue Integrated Modeling over Multiple Temporal Segments"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-7302-5335","authenticated-orcid":false,"given":"Mingxuan","family":"Zhang","sequence":"first","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6109-9417","authenticated-orcid":false,"given":"Qi","family":"He","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4083-5155","authenticated-orcid":false,"given":"Zhaoquan","family":"Yuan","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5533-6152","authenticated-orcid":false,"given":"Tingquan","family":"He","sequence":"additional","affiliation":[{"name":"Southwest Jiaotong University, Chengdu, China and Guangxi Xinfazhan Communications Group Co. Ltd, Guangxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0221-338X","authenticated-orcid":false,"given":"Rong","family":"Li","sequence":"additional","affiliation":[{"name":"PetroChina Shaanxi Marketing Company, Xian, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103741"},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 6299--6308","author":"Carreira A.","key":"e_1_3_2_1_2_1","unstructured":"J. Carreira and A. Zisserman. 2017. Quo vadis, action recognition? a new model and the kinetics dataset. In IEEE Conference on Computer Vision and Pattern Recognition. 6299--6308."},{"volume-title":"PaStaNet: Toward Human Activity Knowledge Engine. In IEEE Conference on Computer Vision and Pattern Recognition. 379--388","author":"Li L.","key":"e_1_3_2_1_3_1","unstructured":"Y. L. Li L. Xu X. Liu X. Huang Y. Xu S. Wang H. S. Fang Z. Ma M. Chen and C. Lu. 2020. PaStaNet: Toward Human Activity Knowledge Engine. In IEEE Conference on Computer Vision and Pattern Recognition. 379--388."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 264--272","author":"Yang X.","key":"e_1_3_2_1_4_1","unstructured":"X. Yang X. Yang M. Y. Liu F. Xiao L. S. Davis and J. Kautz. 2019. Step: Spatio-temporal progressive learning for video action detection. In IEEE Conference on Computer Vision and Pattern Recognition. 264--272."},{"volume-title":"IEEE International Conference on Computer Vision. 8106--8116","author":"Ji R.","key":"e_1_3_2_1_5_1","unstructured":"J. Ji R. Desai and J. C. Niebles. 2021. Detecting human-object relationships in videos. In IEEE International Conference on Computer Vision. 8106--8116."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 640--649","author":"Rajasegaran G.","key":"e_1_3_2_1_6_1","unstructured":"J. Rajasegaran G. Pavlakos A. Kanazawa C. Feichtenhofer and J. Malik. 2023. On the benefits of 3D pose and tracking for human action recognition. In IEEE Conference on Computer Vision and Pattern Recognition. 640--649."},{"volume-title":"IEEE International Conference on Computer Vision. 4405--4413","author":"Kalogeiton P.","key":"e_1_3_2_1_7_1","unstructured":"V. Kalogeiton P. Weinzaepfel V. Ferrari and C. Schmid. 2017. Action tubelet detector for spatio-temporal action localization. In IEEE International Conference on Computer Vision. 4405--4413."},{"key":"e_1_3_2_1_8_1","first-page":"17209","article-title":"Mining the benefits of two-stage and one-stage hoi detection","volume":"34","author":"Zhang Y.","year":"2021","unstructured":"A. Zhang Y. Liao S. Liu M. Lu Y. Wang C. Gao and X. Li. 2021. Mining the benefits of two-stage and one-stage hoi detection. Advances in Neural Information Processing Systems, Vol. 34 (2021), 17209--17220.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Non-local Neural Networks. In IEEE Conference on Computer Vision and Pattern Recognition. 7794--7803","author":"Wang R.","key":"e_1_3_2_1_9_1","unstructured":"X. Wang R. Girshick A. Gupta and K. He. 2018. Non-local Neural Networks. In IEEE Conference on Computer Vision and Pattern Recognition. 7794--7803."},{"volume-title":"Unifying Nonlocal Blocks for Neural Networks. In IEEE International Conference on Computer Vision. 12292--12301","author":"Zhu Q.","key":"e_1_3_2_1_10_1","unstructured":"L. Zhu Q. She D. Li Y. Lu X. Kang J. Hu and C. Wang. 2021. Unifying Nonlocal Blocks for Neural Networks. In IEEE International Conference on Computer Vision. 12292--12301."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 13617--13626","author":"Ulutan A. S. M.","key":"e_1_3_2_1_11_1","unstructured":"O. Ulutan A. S. M. Iftekhar and B. S. Manjunath. 2020. Vsgnet: Spatial attention network for detecting human object interactions using graph convolutions. In IEEE Conference on Computer Vision and Pattern Recognition. 13617--13626."},{"volume-title":"Vision Relation Transformer for Unbiased Scene Graph Generation. In IEEE International Conference on Computer Vision. 21882--21893","author":"Sudhakaran D. S.","key":"e_1_3_2_1_12_1","unstructured":"G. Sudhakaran D. S. Dhami K. Kersting and S. Roth. 2023. Vision Relation Transformer for Unbiased Scene Graph Generation. In IEEE International Conference on Computer Vision. 21882--21893."},{"key":"e_1_3_2_1_13_1","volume-title":"HOTR: End-to-End Human-Object Interaction Detection With Transformers. In IEEE Conference on Computer Vision and Pattern Recognition. 74--83","author":"Kim J.","year":"2021","unstructured":"B. Kim J. Lee J. Kang E. S. Kim and H. W. J Kim. 2021. HOTR: End-to-End Human-Object Interaction Detection With Transformers. In IEEE Conference on Computer Vision and Pattern Recognition. 74--83."},{"volume-title":"European Conference on Computer Vision. Springer, 213--229","author":"Carion F.","key":"e_1_3_2_1_14_1","unstructured":"N. Carion F. Massa G. Synnaeve N. Usunier A. Kirillov and S. Zagoruyko. 2020. End-to-end object detection with transformers. In European Conference on Computer Vision. Springer, 213--229."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 19925--19934","author":"Chen G.","key":"e_1_3_2_1_15_1","unstructured":"J. Chen G. Mittal Y. Yu Y. Kong and M. Chen. 2022. Gatehub: Gated history unit with background suppression for online action detection. In IEEE Conference on Computer Vision and Pattern Recognition. 19925--19934."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 17969--17979","author":"Kuo Z.","key":"e_1_3_2_1_16_1","unstructured":"C.-W. Kuo and Z. Kira. 2022. Beyond a pre-trained object detector: Cross-modal textual and visual context for image captioning. In IEEE Conference on Computer Vision and Pattern Recognition. 17969--17979."},{"volume-title":"ACM International Conference on Multimedia. 4985--4993","author":"Wang G.","key":"e_1_3_2_1_17_1","unstructured":"N. Wang G. Zhu L. Zhang P. Shen H. Li and C. Hua. 2021. Spatio-temporal interaction graph parsing networks for human-object interaction recognition. In ACM International Conference on Multimedia. 4985--4993."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 9523--9532","author":"Xu X.","key":"e_1_3_2_1_18_1","unstructured":"Y. Xu X. Yang L. Gong H.-C. Lin T.-Y. Wu Y. Li and N. Vasconcelos. 2020. Explainable object-induced action decision for autonomous vehicles. In IEEE Conference on Computer Vision and Pattern Recognition. 9523--9532."},{"volume-title":"IEEE International Conference on Computer Vision. 2953--2962","author":"Qian X.","key":"e_1_3_2_1_19_1","unstructured":"Z. Qian X. Wang X. Duan P. Qin Y. Li and W. Zhu. 2023. Decouple before interact: Multi-modal prompt learning for continual visual question answering. In IEEE International Conference on Computer Vision. 2953--2962."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 18675--18685","author":"Yang W.","key":"e_1_3_2_1_20_1","unstructured":"J. Yang W. Peng X. Li Z. Guo L. Chen B. Li Z. Ma K. Zhou W. Zhang C. C. Loy and Z. Liu. 2023. Panoptic video scene graph generation. In IEEE Conference on Computer Vision and Pattern Recognition. 18675--18685."},{"volume-title":"IEEE International Conference on Computer Vision. 6202--6211","author":"Feichtenhofer H.","key":"e_1_3_2_1_21_1","unstructured":"C. Feichtenhofer H. Fan J. Malik and K. He. 2019. Slowfast networks for video recognition. In IEEE International Conference on Computer Vision. 6202--6211."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 13587--13597","author":"Wu Y.","key":"e_1_3_2_1_22_1","unstructured":"C. Y. Wu Y. Li K. Mangalam H. Fan B. Xiong J. Malik and C. Feichtenhofer. 2022a. Memvit: Memory-augmented multiscale vision transformer for efficient long-term video recognition. In IEEE Conference on Computer Vision and Pattern Recognition. 13587--13597."},{"volume-title":"IEEE International Conference on Computer Vision. 6824--6835","author":"Fan B.","key":"e_1_3_2_1_23_1","unstructured":"H. Fan B. Xiong K. Mangalam Y. Li Z. Yan J. Malik and C. Feichtenhofer. 2021. Multiscale vision transformers. In IEEE International Conference on Computer Vision. 6824--6835."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 4804--4814","author":"Li C. Y.","key":"e_1_3_2_1_24_1","unstructured":"Y. Li C. Y. Wu H. Fan K. Mangalam B. Xiong J. Malik and C. Feichtenhofer. 2022b. Mvitv2: Improved multiscale vision transformers for classification and detection. In IEEE Conference on Computer Vision and Pattern Recognition. 4804--4814."},{"volume-title":"Convolutional Transformer Based Dual Discriminator Generative Adversarial Networks for Video Anomaly Detection. In ACM International Conference on Multimedia. 5546--5554","author":"Feng D.","key":"e_1_3_2_1_25_1","unstructured":"X. Feng D. Song Y. Chen Z. Chen J. Ni and H. Chen. 2021. Convolutional Transformer Based Dual Discriminator Generative Adversarial Networks for Video Anomaly Detection. In ACM International Conference on Multimedia. 5546--5554."},{"volume-title":"European Conference on Computer Vision. 71--87","author":"Tang J.","key":"e_1_3_2_1_26_1","unstructured":"J. Tang J. Xia X. Mu B. Pang and C. Lu. 2020. Asynchronous interaction aggregation for action detection. In European Conference on Computer Vision. 71--87."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01657-x"},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 6047--6056","author":"Gu C.","key":"e_1_3_2_1_28_1","unstructured":"C. Gu C. Sun D. A. Ross C. Vondrick C. Pantofaru Y. Li S. Vijayanarasimhan G. Toderici S. Ricco R. Sukthankar C. Schmid and J. Malik. 2018. Ava: A video dataset of spatio-temporally localized atomic visual actions. In IEEE Conference on Computer Vision and Pattern Recognition. 6047--6056."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 464--474","author":"Pan S.","key":"e_1_3_2_1_29_1","unstructured":"J. Pan S. Chen M. Z. Shou Y. Liu J. Shao and H. Li. 2021. Actor-context-actor relation network for spatio-temporal action localization. In IEEE Conference on Computer Vision and Pattern Recognition. 464--474."},{"volume-title":"European Conference on Computer Vision. 401--417","author":"Qi W.","key":"e_1_3_2_1_30_1","unstructured":"S. Qi W. Wang B. Jia J. Shen and S. C. Zhu. 2018. Learning human-object interactions by graph parsing neural networks. In European Conference on Computer Vision. 401--417."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 13598--13607","author":"Zhao Y.","key":"e_1_3_2_1_31_1","unstructured":"J. Zhao Y. Zhang X. Li H. Chen B. Shuai M. Xu C. Liu K. Kundu Y. Xiong D. Modolo I. Marsic C. G. M. Snoek and J. Tighe. 2022. Tuber: Tubelet transformer for video action detection. In IEEE Conference on Computer Vision and Pattern Recognition. 13598--13607."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 18020--18029","author":"Wu X.","key":"e_1_3_2_1_32_1","unstructured":"M. Wu X. Zhang X. Sun Y. Zhou C. Chen J. Gu X. Sun and R. Ji. 2022. Difnet: Boosting visual information flow for image captioning. In IEEE Conference on Computer Vision and Pattern Recognition. 18020--18029."},{"volume-title":"SkeleTR: Towards Skeleton-based Action Recognition in the Wild. In IEEE International Conference on Computer Vision. 13634--13644","author":"Duan M.","key":"e_1_3_2_1_33_1","unstructured":"H. Duan M. Xu B. Shuai D. Modolo Z. Tu J. Tighe and A. Bergamo. 2023. SkeleTR: Towards Skeleton-based Action Recognition in the Wild. In IEEE International Conference on Computer Vision. 13634--13644."},{"key":"e_1_3_2_1_34_1","first-page":"1086","article-title":"Long short-term transformer for online action detection","volume":"34","author":"Xu Y.","year":"2021","unstructured":"M. Xu Y. Xiong H. Chen X. Li W. Xia Z. Tu and S. Soatto. 2021. Long short-term transformer for online action detection. Advances in Neural Information Processing Systems, Vol. 34 (2021), 1086--1099.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 16041--16050","author":"Morais V.","key":"e_1_3_2_1_35_1","unstructured":"R. Morais V. Le S. Venkatesh and T. Tran. 2021. Learning asynchronous and sparse human-object interaction in videos. In IEEE Conference on Computer Vision and Pattern Recognition. 16041--16050."},{"volume-title":"IEEE International Conference on Computer Vision. 21582--21592","author":"Wang M.","key":"e_1_3_2_1_36_1","unstructured":"Y. Wang M. Yasunaga H. Ren S. Wada and J. Leskovec. 2023. Vqa-gnn: Reasoning with multimodal knowledge via graph neural networks for visual question answering. In IEEE International Conference on Computer Vision. 21582--21592."},{"volume-title":"Memory-and-Anticipation Transformer for Online Action Understanding. In IEEE International Conference on Computer Vision. 13824--13835","author":"Wang G.","key":"e_1_3_2_1_37_1","unstructured":"J. Wang G. Chen Y. Huang L. Wang and T. Lu. 2023. Memory-and-Anticipation Transformer for Online Action Understanding. In IEEE International Conference on Computer Vision. 13824--13835."},{"key":"e_1_3_2_1_38_1","volume-title":"Videomamba: State space model for efficient video understanding. In European Conference on Computer Vision","author":"Li X.","year":"2024","unstructured":"K. Li X. Li Y. Wang Y. He Y. Wang L. Wang and Y. Qiao. 2024. Videomamba: State space model for efficient video understanding. In European Conference on Computer Vision. Springer, 237--255."},{"volume-title":"IEEE International Conference on Computer Vision. 19948--19960","author":"Li Y.","key":"e_1_3_2_1_39_1","unstructured":"K. Li Y. Wang Y. Li Y. Wang Y. He L. Wang and Y. Qiao. 2023. Unmasked teacher: Towards training-efficient video foundation models. In IEEE International Conference on Computer Vision. 19948--19960."},{"key":"e_1_3_2_1_40_1","first-page":"21158","article-title":"Neural-logic human-object interaction detection","volume":"36","author":"Li J.","year":"2023","unstructured":"L. Li J. Wei W. Wang and Y. Yang. 2023. Neural-logic human-object interaction detection. Advances in Neural Information Processing Systems, Vol. 36 (2023), 21158--21171.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"European Conference on Computer Vision. Springer, 396--416","author":"Wang K.","key":"e_1_3_2_1_41_1","unstructured":"Y. Wang K. Li X. Li J. Yu Y. He G. Chen B. Pei R. Zheng Z. Wang Y. Shi T. Jiang S. Li J. Xu H. Zhang Y. Huang Y. Qiao Y. Wang and L. Wang. 2024. Internvideo2: Scaling foundation models for multimodal video understanding. In European Conference on Computer Vision. Springer, 396--416."},{"key":"e_1_3_2_1_42_1","first-page":"10078","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume":"35","author":"Tong Y.","year":"2022","unstructured":"Z. Tong Y. Song J. Wang and L. Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in Neural Information Processing Systems, Vol. 35 (2022), 10078--10093.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"ACM International Conference on Multimedia. 521--529","author":"Zhuo Z.","key":"e_1_3_2_1_43_1","unstructured":"T. Zhuo Z. Cheng P. Zhang Y. Wong and M. Kankanhalli. 2019. Explainable video action reasoning via prior knowledge and state transitions. In ACM International Conference on Multimedia. 521--529."},{"volume-title":"IEEE International Conference on Computer Vision. 10388--10399","author":"Chen Z.","key":"e_1_3_2_1_44_1","unstructured":"L. Chen Z. Tong Y. Song G. Wu and L. Wang. 2023a. Efficient video action detection with token dropout and context refinement. In IEEE International Conference on Computer Vision. 10388--10399."},{"volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. 14720--14729","author":"Wu M.","key":"e_1_3_2_1_45_1","unstructured":"T. Wu M. Cao Z. Gao G. Wu and L. Wang. 2023b. Stmixer: A one-stage sparse action detector. In IEEE Conference on Computer Vision and Pattern Recognition. 14720--14729."},{"volume-title":"Global Information Guided Video Anomaly Detection. In ACM International Conference on Multimedia. 4679--4683","author":"Lv C.","key":"e_1_3_2_1_46_1","unstructured":"H. Lv C. Xu and Z. Cui. 2020. Global Information Guided Video Anomaly Detection. In ACM International Conference on Multimedia. 4679--4683."},{"volume-title":"IEEE Winter Conference on Applications of Computer Vision. 381--389","author":"Chao Y.","key":"e_1_3_2_1_47_1","unstructured":"Y. W. Chao Y. Liu X. Liu H. Zeng and J. Deng. 2018. Learning to Detect Human-Object Interactions. In IEEE Winter Conference on Applications of Computer Vision. 381--389."},{"volume-title":"IEEE International Conference on Computer Vision. 21614--21624","author":"Tu W.","key":"e_1_3_2_1_48_1","unstructured":"D. Tu W. Sun G. Zhai and W. Shen. 2023. Agglomerative transformer for human-object interaction detection. In IEEE International Conference on Computer Vision. 21614--21624."},{"volume-title":"IEEE International Conference on Computer Vision. 10422--10432","author":"Cao W.","key":"e_1_3_2_1_49_1","unstructured":"S. Cao W. Luo B. Wang W. Zhang and L. Ma. 2023. E2e-load: end-to-end long-form online action detection. In IEEE International Conference on Computer Vision. 10422--10432."},{"volume-title":"European Conference on Computer Vision. Springer, 485--502","author":"Zhao P.","key":"e_1_3_2_1_50_1","unstructured":"Y. Zhao and P. Kr\u00e4henb\u00fchl. 2022. Real-time online video detection with temporal smoothing transformers. In European Conference on Computer Vision. Springer, 485--502."},{"volume-title":"Proceedings of the 2021 Workshop on Intelligent Cross-Data Analysis and Retrieval. 9--17","author":"Chiou C. Y.","key":"e_1_3_2_1_51_1","unstructured":"M. J. Chiou C. Y. Liao L. W. Wang R. Zimmermann and J. Feng. 2021. St-hoi: A spatial-temporal baseline for human-object interaction detection in videos. In Proceedings of the 2021 Workshop on Intelligent Cross-Data Analysis and Retrieval. 9--17."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733360","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:05:13Z","timestamp":1755749113000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733360"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":51,"alternative-id":["10.1145\/3731715.3733360","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733360","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}