{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:09:56Z","timestamp":1765343396370,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376186"],"award-info":[{"award-number":["62376186"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754712","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"7239-7247","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Ex Pede Herculem, Predicting Global Actionness Curve from Local Clips"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1805-5435","authenticated-orcid":false,"given":"Xu","family":"Chen","sequence":"first","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3028-5972","authenticated-orcid":false,"given":"Yang","family":"Li","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2768-1398","authenticated-orcid":false,"given":"Yahong","family":"Han","sequence":"additional","affiliation":[{"name":"College of Intelligence and Computing, Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4560-8509","authenticated-orcid":false,"given":"Jialie","family":"Shen","sequence":"additional","affiliation":[{"name":"Department of Computer Science, City St George's, University of London, London, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"volume-title":"End-to-end object detection with transformers","author":"Carion Nicolas","key":"e_1_3_2_1_2_1","unstructured":"Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 2020. End-to-end object detection with transformers. In ECCV. Springer, 213-229."},{"key":"e_1_3_2_1_3_1","first-page":"6299","article-title":"Quo vadis, action recognition? a new model and the kinetics dataset","author":"Carreira Joao","year":"2017","unstructured":"Joao Carreira and Andrew Zisserman. 2017. Quo vadis, action recognition? a new model and the kinetics dataset. In CVPR. 6299-6308.","journal-title":"CVPR."},{"key":"e_1_3_2_1_4_1","volume-title":"CTRN: Class-Temporal Relational Network for Action Detection","author":"Dai Rui","year":"2021","unstructured":"Rui Dai, Srijan Das, and Fran\u00e7ois Br\u00e9mond. 2021a. CTRN: Class-Temporal Relational Network for Action Detection. In BMVC. BMVA Press, 224."},{"key":"e_1_3_2_1_5_1","first-page":"20041","article-title":"MS-TCT: multi-scale temporal convtransformer for action detection","author":"Dai Rui","year":"2022","unstructured":"Rui Dai, Srijan Das, Kumara Kahatapitiya, Michael S Ryoo, and Fran\u00e7ois Br\u00e9mond. 2022. MS-TCT: multi-scale temporal convtransformer for action detection. In CVPR. 20041-20051.","journal-title":"CVPR."},{"key":"e_1_3_2_1_6_1","first-page":"2970","article-title":"Pdan: Pyramid dilated attention network for action detection","author":"Dai Rui","year":"2021","unstructured":"Rui Dai, Srijan Das, Luca Minciullo, Lorenzo Garattoni, Gianpiero Francesca, and Fran\u00e7ois Bremond. 2021b. Pdan: Pyramid dilated attention network for action detection. In WACV. 2970-2979.","journal-title":"WACV."},{"key":"e_1_3_2_1_7_1","volume-title":"Words: Transformers for Image Recognition at Scale. In ICLR.","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al., 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR."},{"key":"e_1_3_2_1_8_1","unstructured":"Jufang Duan Wei Zheng Yangzhou Du Wenfa Wu Haipeng Jiang and Hongsheng Qi. [n.d.]. MF-CLR: Multi-Frequency Contrastive Learning Representation for Time Series. In ICML."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2025.3543108"},{"key":"e_1_3_2_1_10_1","unstructured":"Daniel Y Fu Hermann Kumbong Eric Nguyen and Christopher Re. 2024. FlashFFTConv: Efficient Convolutions for Long Sequences with Tensor Cores. In ICLR. https:\/\/openreview.net\/forum?id=gPKTTAfYBp"},{"key":"e_1_3_2_1_11_1","first-page":"5059","article-title":"Haan: Human action aware network for multi-label temporal action detection","author":"Gao Zikai","year":"2023","unstructured":"Zikai Gao, Peng Qiao, and Yong Dou. 2023. Haan: Human action aware network for multi-label temporal action detection. In ACM MM. 5059-5069.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_12_1","first-page":"5842","article-title":"The'' something something'' video database for learning and evaluating visual common sense","author":"Goyal Raghav","year":"2017","unstructured":"Raghav Goyal, Samira Ebrahimi Kahou, Vincent Michalski, Joanna Materzynska, Susanne Westphal, Heuna Kim, Valentin Haenel, Ingo Fruend, Peter Yianilos, Moritz Mueller-Freitag, et al., 2017. The'' something something'' video database for learning and evaluating visual common sense. In ICCV. 5842-5850.","journal-title":"ICCV."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2008.4517537"},{"key":"e_1_3_2_1_14_1","first-page":"6049","article-title":"Adaptive Frequency Filters As Efficient Global Token Mixers","author":"Huang Zhipeng","year":"2023","unstructured":"Zhipeng Huang, Zhizheng Zhang, Cuiling Lan, Zheng-Jun Zha, Yan Lu, and Baining Guo. 2023. Adaptive Frequency Filters As Efficient Global Token Mixers. In ICCV. 6049-6059.","journal-title":"ICCV."},{"key":"e_1_3_2_1_15_1","first-page":"352","volume-title":"IEEE TPAMI","volume":"40","author":"Jiang Yu-Gang","year":"2017","unstructured":"Yu-Gang Jiang, Zuxuan Wu, Jun Wang, Xiangyang Xue, and Shih-Fu Chang. 2017. Exploiting feature and class relationships in video categorization with regularized deep neural networks. IEEE TPAMI, Vol. 40, 2 (2017), 352-364."},{"key":"e_1_3_2_1_16_1","first-page":"8385","article-title":"Coarse-fine networks for temporal activity detection in videos","author":"Kahatapitiya Kumara","year":"2021","unstructured":"Kumara Kahatapitiya and Michael S Ryoo. 2021. Coarse-fine networks for temporal activity detection in videos. In CVPR. 8385-8394.","journal-title":"CVPR."},{"key":"e_1_3_2_1_17_1","volume-title":"Far: Fourier aerial video recognition","author":"Kothandaraman Divya","year":"2022","unstructured":"Divya Kothandaraman, Tianrui Guan, Xijun Wang, Shuowen Hu, Ming Lin, and Dinesh Manocha. 2022. Far: Fourier aerial video recognition. In ECCV. Springer, 657-676."},{"key":"e_1_3_2_1_18_1","first-page":"2556","article-title":"HMDB: a large video database for human motion recognition","author":"Kuehne Hildegard","year":"2011","unstructured":"Hildegard Kuehne, Hueihan Jhuang, Est\u00edbaliz Garrote, Tomaso Poggio, and Thomas Serre. 2011. HMDB: a large video database for human motion recognition. In ICCV. 2556-2563.","journal-title":"ICCV."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3367599"},{"key":"e_1_3_2_1_20_1","first-page":"513","article-title":"Resound: Towards action recognition without representation bias","author":"Li Yingwei","year":"2018","unstructured":"Yingwei Li, Yi Li, and Nuno Vasconcelos. 2018. Resound: Towards action recognition without representation bias. In ECCV. 513-528.","journal-title":"ECCV."},{"key":"e_1_3_2_1_21_1","first-page":"4804","article-title":"Mvitv2: Improved multiscale vision transformers for classification and detection","author":"Li Yanghao","year":"2022","unstructured":"Yanghao Li, Chao-Yuan Wu, Haoqi Fan, Karttikeya Mangalam, Bo Xiong, Jitendra Malik, and Christoph Feichtenhofer. 2022. Mvitv2: Improved multiscale vision transformers for classification and detection. In CVPR. 4804-4814.","journal-title":"CVPR."},{"key":"e_1_3_2_1_22_1","first-page":"3889","article-title":"Bmn: Boundary-matching network for temporal action proposal generation","author":"Lin Tianwei","year":"2019","unstructured":"Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, and Shilei Wen. 2019. Bmn: Boundary-matching network for temporal action proposal generation. In ICCV. 3889-3898.","journal-title":"ICCV."},{"key":"e_1_3_2_1_23_1","first-page":"988","article-title":"Single shot temporal action detection","author":"Lin Tianwei","year":"2017","unstructured":"Tianwei Lin, Xu Zhao, and Zheng Shou. 2017b. Single shot temporal action detection. In ACM MM. 988-996.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2962063"},{"key":"e_1_3_2_1_25_1","first-page":"3","article-title":"Bsn: Boundary sensitive network for temporal action proposal generation","author":"Lin Tianwei","year":"2018","unstructured":"Tianwei Lin, Xu Zhao, Haisheng Su, Chongjing Wang, and Ming Yang. 2018. Bsn: Boundary sensitive network for temporal action proposal generation. In ECCV. 3-19.","journal-title":"ECCV."},{"key":"e_1_3_2_1_26_1","first-page":"2980","article-title":"Focal loss for dense object detection","author":"Lin Tsung-Yi","year":"2017","unstructured":"Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Doll\u00e1r. 2017a. Focal loss for dense object detection. In ICCV. 2980-2988.","journal-title":"ICCV."},{"key":"e_1_3_2_1_27_1","first-page":"4524","article-title":"ETAD: Training Action Detection End to End on a Laptop","author":"Liu Shuming","year":"2023","unstructured":"Shuming Liu, Mengmeng Xu, Chen Zhao, Xu Zhao, and Bernard Ghanem. 2023. ETAD: Training Action Detection End to End on a Laptop. In CVPR. 4524-4533.","journal-title":"CVPR."},{"key":"e_1_3_2_1_28_1","volume-title":"Ssd: Single shot multibox detector","author":"Liu Wei","year":"2016","unstructured":"Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, and Alexander C Berg. 2016. Ssd: Single shot multibox detector. In ECCV. Springer, 21-37."},{"key":"e_1_3_2_1_29_1","first-page":"20010","article-title":"An empirical study of end-to-end temporal action detection","author":"Liu Xiaolong","year":"2022","unstructured":"Xiaolong Liu, Song Bai, and Xiang Bai. 2022a. An empirical study of end-to-end temporal action detection. In CVPR. 20010-20019.","journal-title":"CVPR."},{"key":"e_1_3_2_1_30_1","first-page":"5427","article-title":"End-to-end temporal action detection with transformer","volume":"31","author":"Liu Xiaolong","year":"2022","unstructured":"Xiaolong Liu, Qimeng Wang, Yao Hu, Xu Tang, Shiwei Zhang, Song Bai, and Xiang Bai. 2022b. End-to-end temporal action detection with transformer. IEEE TIP, Vol. 31 (2022), 5427-5441.","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_31_1","first-page":"10012","article-title":"Swin transformer: Hierarchical vision transformer using shifted windows","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo. 2021. Swin transformer: Hierarchical vision transformer using shifted windows. In ICCV. 10012-10022.","journal-title":"ICCV."},{"key":"e_1_3_2_1_32_1","first-page":"5152","article-title":"Temporal gaussian mixture layer for videos","author":"Piergiovanni AJ","year":"2019","unstructured":"AJ Piergiovanni and Michael Ryoo. 2019. Temporal gaussian mixture layer for videos. In ICML. PMLR, 5152-5161.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_1_33_1","first-page":"5304","article-title":"Learning latent super-events to detect multiple activities in videos","author":"Piergiovanni AJ","year":"2018","unstructured":"AJ Piergiovanni and Michael S Ryoo. 2018. Learning latent super-events to detect multiple activities in videos. In CVPR. 5304-5313.","journal-title":"CVPR."},{"key":"e_1_3_2_1_34_1","first-page":"1137","volume-title":"IEEE TPAMI","volume":"39","author":"Ren Shaoqing","year":"2016","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2016. Faster R-CNN: Towards real-time object detection with region proposal networks. IEEE TPAMI, Vol. 39, 6 (2016), 1137-1149."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_36_1","first-page":"2988","article-title":"Pat: Position-aware transformer for dense multi-label action detection","author":"Sardari Faegheh","year":"2023","unstructured":"Faegheh Sardari, Armin Mustafa, Philip JB Jackson, and Adrian Hilton. 2023. Pat: Position-aware transformer for dense multi-label action detection. In ICCV. 2988-2997.","journal-title":"ICCV."},{"key":"e_1_3_2_1_37_1","first-page":"18857","article-title":"Tridet: Temporal action detection with relative boundary modeling","author":"Shi Dingfeng","year":"2023","unstructured":"Dingfeng Shi, Yujie Zhong, Qiong Cao, Lin Ma, Jia Li, and Dacheng Tao. 2023. Tridet: Temporal action detection with relative boundary modeling. In CVPR. 18857-18866.","journal-title":"CVPR."},{"key":"e_1_3_2_1_38_1","volume-title":"NeurIPS","volume":"36","author":"Shin Donghyeok","year":"2024","unstructured":"Donghyeok Shin, Seungjae Shin, and Il-Chul Moon. 2024. Frequency domain-based dataset distillation. NeurIPS, Vol. 36 (2024)."},{"volume-title":"Hollywood in homes: Crowdsourcing data collection for activity understanding","author":"Sigurdsson Gunnar A","key":"e_1_3_2_1_39_1","unstructured":"Gunnar A Sigurdsson, G\u00fcl Varol, Xiaolong Wang, Ali Farhadi, Ivan Laptev, and Abhinav Gupta. 2016. Hollywood in homes: Crowdsourcing data collection for activity understanding. In ECCV. Springer, 510-526."},{"key":"e_1_3_2_1_40_1","first-page":"13526","article-title":"Relaxed transformer decoders for direct action proposal generation","author":"Tan Jing","year":"2021","unstructured":"Jing Tan, Jiaqi Tang, Limin Wang, and Gangshan Wu. 2021. Relaxed transformer decoders for direct action proposal generation. In ICCV. 13526-13535.","journal-title":"ICCV."},{"key":"e_1_3_2_1_41_1","first-page":"15268","article-title":"Pointtad: Multi-label temporal action detection with learnable query points","volume":"35","author":"Tan Jing","year":"2022","unstructured":"Jing Tan, Xiaotong Zhao, Xintian Shi, Bin Kang, and Limin Wang. 2022. Pointtad: Multi-label temporal action detection with learnable query points. In NeurIPS, Vol. 35. 15268-15280.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_42_1","first-page":"1460","article-title":"Modeling multi-label action dependencies for temporal action localization","author":"Tirupattur Praveen","year":"2021","unstructured":"Praveen Tirupattur, Kevin Duarte, Yogesh S Rawat, and Mubarak Shah. 2021. Modeling multi-label action dependencies for temporal action localization. In CVPR. 1460-1470.","journal-title":"CVPR."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372297.3417254"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645710"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i9.26232"},{"key":"e_1_3_2_1_46_1","first-page":"5783","article-title":"R-c3d: Region convolutional 3d network for temporal activity detection","author":"Xu Huijuan","year":"2017","unstructured":"Huijuan Xu, Abir Das, and Kate Saenko. 2017. R-c3d: Region convolutional 3d network for temporal activity detection. In ICCV. 5783-5792.","journal-title":"ICCV."},{"key":"e_1_3_2_1_47_1","first-page":"10156","article-title":"G-tad: Sub-graph localization for temporal action detection","author":"Xu Mengmeng","year":"2020","unstructured":"Mengmeng Xu, Chen Zhao, David S Rojas, Ali Thabet, and Bernard Ghanem. 2020. G-tad: Sub-graph localization for temporal action detection. In CVPR. 10156-10165.","journal-title":"CVPR."},{"key":"e_1_3_2_1_48_1","volume-title":"Overview frequency principle\/spectral bias in deep learning. Communications on Applied Mathematics and Computation","author":"John Xu Zhi-Qin","year":"2024","unstructured":"Zhi-Qin John Xu, Yaoyu Zhang, and Tao Luo. 2024. Overview frequency principle\/spectral bias in deep learning. Communications on Applied Mathematics and Computation (2024), 1-38."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1013-y"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3048440"},{"key":"e_1_3_2_1_51_1","first-page":"15931","article-title":"FreeKD: Knowledge Distillation via Semantic Frequency Prompt","author":"Zhang Yuan","year":"2024","unstructured":"Yuan Zhang, Tao Huang, Jiaming Liu, Tao Jiang, Kuan Cheng, and Shanghang Zhang. 2024. FreeKD: Knowledge Distillation via Semantic Frequency Prompt. In CVPR. 15931-15940.","journal-title":"CVPR."},{"key":"e_1_3_2_1_52_1","first-page":"2914","article-title":"Temporal action detection with structured segment networks","author":"Zhao Yue","year":"2017","unstructured":"Yue Zhao, Yuanjun Xiong, Limin Wang, Zhirong Wu, Xiaoou Tang, and Dahua Lin. 2017. Temporal action detection with structured segment networks. In ICCV. 2914-2923.","journal-title":"ICCV."},{"key":"e_1_3_2_1_53_1","first-page":"27268","article-title":"Fedformer: Frequency enhanced decomposed transformer for long-term series forecasting","author":"Zhou Tian","year":"2022","unstructured":"Tian Zhou, Ziqing Ma, Qingsong Wen, Xue Wang, Liang Sun, and Rong Jin. 2022. Fedformer: Frequency enhanced decomposed transformer for long-term series forecasting. In ICML. PMLR, 27268-27286.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_1_54_1","first-page":"18559","article-title":"Dual DETRs for Multi-Label Temporal Action Detection","author":"Zhu Yuhan","year":"2024","unstructured":"Yuhan Zhu, Guozhen Zhang, Jing Tan, Gangshan Wu, and Limin Wang. 2024. Dual DETRs for Multi-Label Temporal Action Detection. In CVPR. 18559-18569.","journal-title":"CVPR."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754712","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:06:08Z","timestamp":1765343168000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754712"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":54,"alternative-id":["10.1145\/3746027.3754712","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754712","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}