{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:50:26Z","timestamp":1755802226704,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Supported by the Fundamental Research Funds for the Central Universities"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3657617","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"1135-1139","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Directly Locating Actions in Video with Single Frame Annotation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-4975-194X","authenticated-orcid":false,"given":"Haoran","family":"Tong","sequence":"first","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2638-4324","authenticated-orcid":false,"given":"Xinyan","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3954-2387","authenticated-orcid":false,"given":"Guorong","family":"Li","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9923-5034","authenticated-orcid":false,"given":"Laiyun","family":"Qing","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, 
China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"volume-title":"Rethinking the Faster R-CNN Architecture for Temporal Action Localization","author":"Chao Yu-Wei","key":"e_1_3_2_1_2_1","unstructured":"Yu-Wei Chao, Sudheendra Vijayanarasimhan, Bryan Seybold, David A. Ross, Jia Deng, and Rahul Sukthankar. 2018. Rethinking the Faster R-CNN Architecture for Temporal Action Localization. In CVPR. Computer Vision Foundation \/ IEEE Computer Society, 1130--1139."},{"volume-title":"ECCV (4) (Lecture Notes in Computer Science","author":"Chen Mengyuan","key":"e_1_3_2_1_3_1","unstructured":"Mengyuan Chen, Junyu Gao, Shicai Yang, and Changsheng Xu. 2022. Dual-Evidential Learning for Weakly-supervised Temporal Action Localization. In ECCV (4) (Lecture Notes in Computer Science, Vol. 13664). Springer, 192--208."},{"volume-title":"ECCV (34) (Lecture Notes in Computer Science","author":"Cheng Feng","key":"e_1_3_2_1_4_1","unstructured":"Feng Cheng and Gedas Bertasius. 2022. TallFormer: Temporal Action Localization with a Long-Memory Transformer. In ECCV (34) (Lecture Notes in Computer Science, Vol. 13694). Springer, 503--521."},{"key":"e_1_3_2_1_5_1","volume-title":"Mayol-Cuevas","author":"Damen Dima","year":"2014","unstructured":"Dima Damen, Teesid Leelasawassuk, Osian Haines, Andrew Calway, and Walterio W. Mayol-Cuevas. 2014. You-Do, I-Learn: Discovering Task Relevant Objects and their Modes of Interaction from Multi-User Egocentric Video. In BMVC. BMVA Press."},{"volume-title":"ASM-Loc: Action-aware Segment Modeling for Weakly-Supervised Temporal Action Localization","author":"He Bo","key":"e_1_3_2_1_6_1","unstructured":"Bo He, Xitong Yang, Le Kang, Zhiyu Cheng, Xin Zhou, and Abhinav Shrivastava. 2022. ASM-Loc: Action-aware Segment Modeling for Weakly-Supervised Temporal Action Localization. In CVPR. 
IEEE, 13915--13925."},{"volume-title":"Foreground-Action Consistency Network for Weakly Supervised Temporal Action Localization","author":"Huang Linjiang","key":"e_1_3_2_1_7_1","unstructured":"Linjiang Huang, Liang Wang, and Hongsheng Li. 2021. Foreground-Action Consistency Network for Weakly Supervised Temporal Action Localization. In ICCV. IEEE, 7982--7991."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Yuan Ji Xu Jia Huchuan Lu and Xiang Ruan. 2021. Weakly-Supervised Temporal Action Localization via Cross-Stream Collaborative Learning. In ACM Multimedia. ACM 853--861.","DOI":"10.1145\/3474085.3475261"},{"volume-title":"Divide and Conquer for Single-frame Temporal Action Localization","author":"Ju Chen","key":"e_1_3_2_1_10_1","unstructured":"Chen Ju, Peisen Zhao, Siheng Chen, Ya Zhang, Yanfeng Wang, and Qi Tian. 2021. Divide and Conquer for Single-frame Temporal Action Localization. In ICCV. IEEE, 13435--13444."},{"key":"e_1_3_2_1_11_1","volume-title":"Karen Simonyan, Brian Zhang, Chloe Hillier, Sudheendra Vijayanarasimhan, Fabio Viola, Tim Green, Trevor Back, Paul Natsev, Mustafa Suleyman, and Andrew Zisserman.","author":"Kay Will","year":"2017","unstructured":"Will Kay, Jo\u00e3o Carreira, Karen Simonyan, Brian Zhang, Chloe Hillier, Sudheendra Vijayanarasimhan, Fabio Viola, Tim Green, Trevor Back, Paul Natsev, Mustafa Suleyman, and Andrew Zisserman. 2017. The Kinetics Human Action Video Dataset. CoRR, Vol. abs\/1705.06950 (2017). arXiv:1705.06950 http:\/\/arxiv.org\/abs\/1705.06950"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Pilhyeon Lee and Hyeran Byun. 2021. Learning Action Completeness from Points for Weakly-supervised Temporal Action Localization. In ICCV. 
13628--13637.","DOI":"10.1109\/ICCV48922.2021.01339"},{"volume-title":"Background Suppression Network for Weakly-Supervised Temporal Action Localization","author":"Lee Pilhyeon","key":"e_1_3_2_1_13_1","unstructured":"Pilhyeon Lee, Youngjung Uh, and Hyeran Byun. 2020. Background Suppression Network for Weakly-Supervised Temporal Action Localization. In AAAI. AAAI Press, 11320--11327."},{"volume-title":"Temporal Deformable Residual Networks for Action Segmentation in Videos","author":"Lei Peng","key":"e_1_3_2_1_14_1","unstructured":"Peng Lei and Sinisa Todorovic. 2018. Temporal Deformable Residual Networks for Action Segmentation in Videos. In CVPR. Computer Vision Foundation \/ IEEE Computer Society, 6742--6751."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.118965"},{"key":"e_1_3_2_1_16_1","volume-title":"Yazan Abu Farha, and J\u00fcrgen Gall","author":"Li Zhe","year":"2021","unstructured":"Zhe Li, Yazan Abu Farha, and J\u00fcrgen Gall. 2021. Temporal Action Segmentation From Timestamp Supervision. In CVPR. Computer Vision Foundation \/ IEEE, 8365--8374."},{"volume-title":"ECCV (4) (Lecture Notes in Computer Science","author":"Ma Fan","key":"e_1_3_2_1_17_1","unstructured":"Fan Ma, Linchao Zhu, Yi Yang, Shengxin Zha, Gourab Kundu, Matt Feiszli, and Zheng Shou. 2020. SF-Net: Single-Frame Supervision for Temporal Action Localization. In ECCV (4) (Lecture Notes in Computer Science, Vol. 12349). Springer, 420--437."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Davide Moltisanti Sanja Fidler and Dima Damen. 2019. Action Recognition From Single Timestamp Supervision in Untrimmed Videos. In CVPR. 9915--9924.","DOI":"10.1109\/CVPR.2019.01015"},{"volume-title":"Weakly Supervised Action Localization by Sparse Temporal Pooling Network","author":"Nguyen Phuc","key":"e_1_3_2_1_19_1","unstructured":"Phuc Nguyen, Ting Liu, Gautam Prasad, and Bohyung Han. 2018. 
Weakly Supervised Action Localization by Sparse Temporal Pooling Network. In CVPR. Computer Vision Foundation \/ IEEE Computer Society, 6752--6761."},{"key":"e_1_3_2_1_20_1","volume-title":"Roy-Chowdhury","author":"Paul Sujoy","year":"2018","unstructured":"Sujoy Paul, Sourya Roy, and Amit K. Roy-Chowdhury. 2018. W-TALC: Weakly-Supervised Temporal Activity Localization and Classification. In ECCV (4) (Lecture Notes in Computer Science, Vol. 11208). Springer, 588--607."},{"volume-title":"Temporal Action Localization in Untrimmed Videos via Multi-stage CNNs","author":"Shou Zheng","key":"e_1_3_2_1_21_1","unstructured":"Zheng Shou, Dongang Wang, and Shih-Fu Chang. 2016. Temporal Action Localization in Untrimmed Videos via Multi-stage CNNs. In CVPR. IEEE Computer Society, 1049--1058."},{"volume-title":"ACCV (2) (Lecture Notes in Computer Science","author":"Su Haisheng","key":"e_1_3_2_1_22_1","unstructured":"Haisheng Su, Xu Zhao, and Tianwei Lin. 2018. Cascaded Pyramid Mining Network for Weakly Supervised Temporal Action Localization. In ACCV (2) (Lecture Notes in Computer Science, Vol. 11362). Springer, 558--574."},{"volume-title":"UntrimmedNets for Weakly Supervised Action Recognition and Detection","author":"Wang Limin","key":"e_1_3_2_1_23_1","unstructured":"Limin Wang, Yuanjun Xiong, Dahua Lin, and Luc Van Gool. 2017. UntrimmedNets for Weakly Supervised Action Recognition and Detection. In CVPR. IEEE Computer Society, 6402--6411."},{"volume-title":"ECCV (34) (Lecture Notes in Computer Science","author":"Weng Yuetian","key":"e_1_3_2_1_24_1","unstructured":"Yuetian Weng, Zizheng Pan, Mingfei Han, Xiaojun Chang, and Bohan Zhuang. 2022. An Efficient Spatio-Temporal Pyramid Transformer for Action Detection. In ECCV (34) (Lecture Notes in Computer Science, Vol. 13694). 
Springer, 358--375."},{"volume-title":"R-C3D: Region Convolutional 3D Network for Temporal Activity Detection","author":"Xu Huijuan","key":"e_1_3_2_1_25_1","unstructured":"Huijuan Xu, Abir Das, and Kate Saenko. 2017. R-C3D: Region Convolutional 3D Network for Temporal Activity Detection. In ICCV. IEEE Computer Society, 5794--5803."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Runhao Zeng Wenbing Huang Chuang Gan Mingkui Tan Yu Rong Peilin Zhao and Junzhou Huang. 2019. Graph Convolutional Networks for Temporal Action Localization. In ICCV. 7093--7102.","DOI":"10.1109\/ICCV.2019.00719"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Chen-Lin Zhang Jianxin Wu and Yin Li. 2022. ActionFormer: Localizing Moments of Actions with Transformers. In ECCV. 492--510.","DOI":"10.1007\/978-3-031-19772-7_29"},{"volume-title":"Temporal Action Detection with Structured Segment Networks","author":"Zhao Yue","key":"e_1_3_2_1_28_1","unstructured":"Yue Zhao, Yuanjun Xiong, Limin Wang, Zhirong Wu, Xiaoou Tang, and Dahua Lin. 2017. Temporal Action Detection with Structured Segment Networks. In ICCV. IEEE Computer Society, 2933--2942."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Zixin Zhu Wei Tang Nanning Zheng and Gang Hua. 2021. Enriching Local and Global Contexts for Temporal Action Localization. In ICCV. 
13496--13505.","DOI":"10.1109\/ICCV48922.2021.01326"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3657617","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3657617","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:49:09Z","timestamp":1755766149000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3657617"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":29,"alternative-id":["10.1145\/3652583.3657617","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3657617","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}