{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T06:15:35Z","timestamp":1763705735092,"version":"3.45.0"},"reference-count":65,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J. Comput. Sci. Technol."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1007\/s11390-025-5281-7","type":"journal-article","created":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T04:25:47Z","timestamp":1763699147000},"page":"1270-1284","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Concept-Guided Open-Vocabulary Temporal Action Detection"],"prefix":"10.1007","volume":"40","author":[{"given":"Song-Miao","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui-Ze","family":"Han","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Feng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,21]]},"reference":[{"key":"5281_CR1","doi-asserted-by":"publisher","first-page":"2933","DOI":"10.1109\/ICCV.2017.317","volume-title":"Proc. the 2017 IEEE International Conference on Computer Vision","author":"Y Zhao","year":"2017","unstructured":"Zhao Y, Xiong Y, Wang L, Wu Z, Tang X, Lin D. Temporal action detection with structured segment networks. In Proc. the 2017 IEEE International Conference on Computer Vision, Oct. 2017, pp.2933\u20132942. DOI: https:\/\/doi.org\/10.1109\/ICCV.2017.317."},{"key":"5281_CR2","doi-asserted-by":"publisher","first-page":"162","DOI":"10.1007\/978-3-030-01270-0_10","volume-title":"Proc. the 15th European Conference on Computer Vision","author":"Z Shou","year":"2018","unstructured":"Shou Z, Gao H, Zhang L, Miyazawa K, Chang S F. AutoLoc: Weakly-supervised temporal action localization in untrimmed videos. In Proc. the 15th European Conference on Computer Vision, Sept. 2018, pp.162\u2013179. DOI: https:\/\/doi.org\/10.1007\/978-3-030-01270-0_10."},{"issue":"4","key":"5281_CR3","doi-asserted-by":"publisher","first-page":"4302","DOI":"10.1109\/TPAMI.2022.3193611","volume":"45","author":"E Vahdani","year":"2023","unstructured":"Vahdani E, Tian Y. Deep learning-based action detection in untrimmed videos: A survey. IEEE Trans. Pattern Analysis and Machine Intelligence, 2023, 45(4): 4302\u20134320. DOI: https:\/\/doi.org\/10.1109\/TPAMI.2022.3193611.","journal-title":"IEEE Trans. Pattern Analysis and Machine Intelligence"},{"key":"5281_CR4","doi-asserted-by":"publisher","first-page":"5794","DOI":"10.1109\/iccv.2017.617","volume-title":"Proc. the 2017 IEEE International Conference on Computer Vision","author":"H Xu","year":"2017","unstructured":"Xu H, Das A, Saenko K. R-C3D: Region convolutional 3D network for temporal activity detection. In Proc. the 2017 IEEE International Conference on Computer Vision, Oct. 2017, pp.5794\u20135803. DOI: https:\/\/doi.org\/10.1109\/iccv.2017.617."},{"key":"5281_CR5","doi-asserted-by":"publisher","first-page":"4002","DOI":"10.1109\/TIP.2024.3413599","volume":"33","author":"Y Tang","year":"2024","unstructured":"Tang Y, Wang W, Zhang C, Liu J, Zhao Y. Learnable feature augmentation framework for temporal action localization. IEEE Trans. Image Processing, 2024, 33: 4002\u20134015. DOI: https:\/\/doi.org\/10.1109\/TIP.2024.3413599.","journal-title":"IEEE Trans. Image Processing"},{"key":"5281_CR6","doi-asserted-by":"publisher","first-page":"10153","DOI":"10.1109\/cvpr42600.2020.01017","volume-title":"Proc. the 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"M Xu","year":"2020","unstructured":"Xu M, Zhao C, Rojas D S, Thabet A, Ghanem B. G-TAD: Sub-graph localization for temporal action detection. In Proc. the 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Jun. 2020, pp.10153\u201310162. DOI: https:\/\/doi.org\/10.1109\/cvpr42600.2020.01017."},{"key":"5281_CR7","doi-asserted-by":"publisher","first-page":"6402","DOI":"10.1109\/cvpr.2017.678","volume-title":"Proc. the 2017 IEEE Conference on Computer Vision and Pattern Recognition","author":"L Wang","year":"2017","unstructured":"Wang L, Xiong Y, Lin D, Van Gool L. UntrimmedNets for weakly supervised action recognition and detection. In Proc. the 2017 IEEE Conference on Computer Vision and Pattern Recognition, Jul. 2017, pp.6402\u20136411. DOI: https:\/\/doi.org\/10.1109\/cvpr.2017.678."},{"issue":"12","key":"5281_CR8","doi-asserted-by":"publisher","first-page":"7728","DOI":"10.1109\/TPAMI.2024.3395778","volume":"46","author":"Z Li","year":"2024","unstructured":"Li Z, Zhong Y, Song R, Li T, Ma L, Zhang W. DeTAL: Open-vocabulary temporal action localization with decoupled networks. IEEE Trans. Pattern Analysis and Machine Intelligence, 2024, 46(12): 7728\u20137741. DOI: https:\/\/doi.org\/10.1109\/TPAMI.2024.3395778.","journal-title":"IEEE Trans. Pattern Analysis and Machine Intelligence"},{"key":"5281_CR9","doi-asserted-by":"publisher","first-page":"876","DOI":"10.1109\/cvpr42600.2020.00096","volume-title":"Proc. the 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"L Zhang","year":"2020","unstructured":"Zhang L, Chang X, Liu J, Luo M, Wang S, Ge Z, Hauptmann A. ZSTAD: Zero-shot temporal activity detection. In Proc. the 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Jun. 2020, pp.876\u2013885. DOI: https:\/\/doi.org\/10.1109\/cvpr42600.2020.00096."},{"key":"5281_CR10","first-page":"8748","volume-title":"Proc. the 38th International Conference on Machine Learning","author":"A Radford","year":"2021","unstructured":"Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I. Learning transferable visual models from natural language supervision. In Proc. the 38th International Conference on Machine Learning, Jul. 2021, pp.8748\u20138763."},{"key":"5281_CR11","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/978-3-031-19833-5_7","volume-title":"Proc. the 17th European Conference on Computer Vision","author":"C Ju","year":"2022","unstructured":"Ju C, Han T, Zheng K, Zhang Y, Xie W. Prompting visual-language models for efficient video understanding. In Proc. the 17th European Conference on Computer Vision, Oct. 2022, pp.105\u2013124. DOI: https:\/\/doi.org\/10.1007\/978-3-031-19833-5_7."},{"key":"5281_CR12","doi-asserted-by":"publisher","first-page":"681","DOI":"10.1007\/978-3-031-20062-5_39","volume-title":"Proc. the 17th European Conference on Computer Vision","author":"S Nag","year":"2022","unstructured":"Nag S, Zhu X, Song Y Z, Xiang T. Zero-shot temporal action detection via vision-language prompting. In Proc. the 17th European Conference on Computer Vision, Oct. 2022, pp.681\u2013697. DOI: https:\/\/doi.org\/10.1007\/978-3-031-20062-5_39."},{"key":"5281_CR13","doi-asserted-by":"publisher","first-page":"961","DOI":"10.1109\/cvpr.2015.7298698","volume-title":"Proc. the 2015 IEEE Conference on Computer Vision and Pattern Recognition","author":"F Caba Heilbron","year":"2015","unstructured":"Caba Heilbron F, Escorcia V, Ghanem B, Niebles J C. ActivityNet: A large-scale video benchmark for human activity understanding. In Proc. the 2015 IEEE Conference on Computer Vision and Pattern Recognition, Jun. 2015, pp.961\u2013970. DOI: https:\/\/doi.org\/10.1109\/cvpr.2015.7298698."},{"key":"5281_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.cviu.2016.10.018","volume":"155","author":"H Idrees","year":"2017","unstructured":"Idrees H, Zamir A R, Jiang Y G, Gorban A, Laptev I, Sukthankar R, Shah M. The THUMOS challenge on action recognition for videos \u201cin the wild\u201d. Computer Vision and Image Understanding, 2017, 155: 1\u201323. DOI: https:\/\/doi.org\/10.1016\/j.cviu.2016.10.018.","journal-title":"Computer Vision and Image Understanding"},{"key":"5281_CR15","doi-asserted-by":"publisher","first-page":"1914","DOI":"10.1109\/cvpr.2016.211","volume-title":"Proc. the 2016 IEEE Conference on Computer Vision and Pattern Recognition","author":"F Caba Heilbron","year":"2016","unstructured":"Caba Heilbron F, Niebles J C, Ghanem B. Fast temporal activity proposals for efficient detection of human actions in untrimmed videos. In Proc. the 2016 IEEE Conference on Computer Vision and Pattern Recognition, Jun. 2016, pp.1914\u20131923. DOI: https:\/\/doi.org\/10.1109\/cvpr.2016.211."},{"key":"5281_CR16","doi-asserted-by":"publisher","first-page":"768","DOI":"10.1007\/978-3-319-46487-9_47","volume-title":"Proc. the 14th European Conference on Computer Vision","author":"V Escorcia","year":"2016","unstructured":"Escorcia V, Caba Heilbron F, Niebles J C, Ghanem B. DAPs: Deep action proposals for action understanding. In Proc. the 14th European Conference on Computer Vision, Oct. 2016, pp.768\u2013784. DOI: https:\/\/doi.org\/10.1007\/978-3-319-46487-9_47."},{"key":"5281_CR17","doi-asserted-by":"publisher","first-page":"3888","DOI":"10.1109\/iccv.2019.00399","volume-title":"Proc. the 2019 IEEE\/CVF International Conference on Computer Vision","author":"T Lin","year":"2019","unstructured":"Lin T, Liu X, Li X, Ding E, Wen S. BMN: Boundary-matching network for temporal action proposal generation. In Proc. the 2019 IEEE\/CVF International Conference on Computer Vision, Oct. 27\u2013Nov. 2, 2019, pp.3888\u20133897. DOI: https:\/\/doi.org\/10.1109\/iccv.2019.00399."},{"key":"5281_CR18","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-030-01225-0_1","volume-title":"Proc. the 15th European Conference on Computer Vision","author":"T Lin","year":"2018","unstructured":"Lin T, Zhao X, Su H, Wang C, Yang M. BSN: Boundary sensitive network for temporal action proposal generation. In Proc. the 15th European Conference on Computer Vision, Sept. 2018, pp.3\u201321. DOI: https:\/\/doi.org\/10.1007\/978-3-030-01225-0_1."},{"key":"5281_CR19","doi-asserted-by":"publisher","first-page":"11612","DOI":"10.1609\/aaai.v34i07.6829","volume-title":"Proc. the 34th AAAI Conference on Artificial Intelligence","author":"Q Liu","year":"2020","unstructured":"Liu Q, Wang Z. Progressive boundary refinement network for temporal action detection. In Proc. the 34th AAAI Conference on Artificial Intelligence, Feb. 2020, pp.11612\u201311619. DOI: https:\/\/doi.org\/10.1609\/aaai.v34i07.6829."},{"key":"5281_CR20","doi-asserted-by":"publisher","first-page":"7093","DOI":"10.1109\/iccv.2019.00719","volume-title":"Proc. the 2019 IEEE\/CVF International Conference on Computer Vision","author":"R Zeng","year":"2019","unstructured":"Zeng R, Huang W, Gan C, Tan M, Rong Y, Zhao P, Huang J. Graph convolutional networks for temporal action localization. In Proc. the 2019 IEEE\/CVF International Conference on Computer Vision, Oct. 27\u2013Nov. 2, 2019, pp.7093\u20137102. DOI: https:\/\/doi.org\/10.1109\/iccv.2019.00719."},{"key":"5281_CR21","doi-asserted-by":"publisher","first-page":"13638","DOI":"10.1109\/iccv48922.2021.01340","volume-title":"Proc. the 2021 IEEE\/CVF International Conference on Computer Vision","author":"C Zhao","year":"2021","unstructured":"Zhao C, Thabet A, Ghanem B. Video self-stitching graph network for temporal action localization. In Proc. the 2021 IEEE\/CVF International Conference on Computer Vision, Oct. 2021, pp.13638\u201313647. DOI: https:\/\/doi.org\/10.1109\/iccv48922.2021.01340."},{"key":"5281_CR22","doi-asserted-by":"publisher","first-page":"4626","DOI":"10.1609\/aaai.v34i04.5893","volume-title":"Proc. the 34th AAAI Conference on Artificial Intelligence","author":"J Li","year":"2020","unstructured":"Li J, Liu X, Zong Z, Zhao W, Zhang M, Song J. Graph attention based proposal 3D ConvNets for action detection. In Proc. the 34th AAAI Conference on Artificial Intelligence, Feb. 2020, pp.4626\u20134633. DOI: https:\/\/doi.org\/10.1609\/aaai.v34i04.5893."},{"issue":"1","key":"5281_CR23","doi-asserted-by":"publisher","first-page":"118","DOI":"10.1007\/s11263-023-01857-z","volume":"132","author":"R Han","year":"2024","unstructured":"Han R, Feng W, Wang F, Qian Z, Yan H, Wang S. Benchmarking the complementary-view multi-human association and tracking. International Journal of Computer Vision, 2024, 132(1): 118\u2013136. DOI: https:\/\/doi.org\/10.1007\/s11263-023-01857-z.","journal-title":"International Journal of Computer Vision"},{"issue":"5","key":"5281_CR24","doi-asserted-by":"publisher","first-page":"1180","DOI":"10.1007\/s11390-022-2194-6","volume":"39","author":"S K Zhang","year":"2024","unstructured":"Zhang S K, Xie W Y, Wang C, Zhang S H. ScenePalette: Contextually exploring object collections through multiplex relations in 3D scenes. Journal of Computer Science and Technology, 2024, 39(5): 1180\u20131192. DOI: https:\/\/doi.org\/10.1007\/s11390-022-2194-6.","journal-title":"Journal of Computer Science and Technology"},{"issue":"1","key":"5281_CR25","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1109\/TPAMI.2024.3463966","volume":"47","author":"W Feng","year":"2025","unstructured":"Feng W, Wang F, Han R, Gan Y, Qian Z, Hou J, Wang S. Unveiling the power of self-supervision for multi-view multi-human association and tracking. IEEE Trans. Pattern Analysis and Machine Intelligence, 2025, 47(1): 351\u2013368. DOI: https:\/\/doi.org\/10.1109\/TPAMI.2024.3463966.","journal-title":"IEEE Trans. Pattern Analysis and Machine Intelligence"},{"issue":"3","key":"5281_CR26","doi-asserted-by":"publisher","first-page":"626","DOI":"10.1007\/s11390-022-2204-8","volume":"37","author":"X Feng","year":"2022","unstructured":"Feng X, Wu H M, Yin Y H, Lan L B. CGTracker: Center graph network for one-stage multi-pedestrian-object detection and tracking. Journal of Computer Science and Technology, 2022, 37(3): 626\u2013640. DOI: https:\/\/doi.org\/10.1007\/s11390-022-2204-8.","journal-title":"Journal of Computer Science and Technology"},{"key":"5281_CR27","doi-asserted-by":"publisher","first-page":"244","DOI":"10.1007\/978-3-031-19772-7_15","volume-title":"Proc. the 17th European Conference on Computer Vision","author":"R Han","year":"2022","unstructured":"Han R, Yan H, Li J, Wang S, Feng W, Wang S. Panoramic human activity recognition. In Proc. the 17th European Conference on Computer Vision, Oct. 2022, pp.244\u2013261. DOI: https:\/\/doi.org\/10.1007\/978-3-031-19772-7_15."},{"issue":"4","key":"5281_CR28","doi-asserted-by":"publisher","first-page":"811","DOI":"10.1007\/s11390-024-4125-1","volume":"39","author":"X Y Qin","year":"2024","unstructured":"Qin X Y, Li L S, Tang J Y, Hao F, Ge M L, Pang G Y. Multi-task visual semantic embedding network for imagetext retrieval. Journal of Computer Science and Technology, 2024, 39(4): 811\u2013826. DOI: https:\/\/doi.org\/10.1007\/s11390-024-4125-1.","journal-title":"Journal of Computer Science and Technology"},{"issue":"1","key":"5281_CR29","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1007\/s41095-022-0272-x","volume":"9","author":"A Zimmer","year":"2023","unstructured":"Zimmer A, Hilsmann A, Morgenstern W, Eisert P. Imposing temporal consistency on deep monocular body shape and pose estimation. Computational Visual Media, 2023, 9(1): 123\u2013139. DOI: https:\/\/doi.org\/10.1007\/s41095-022-0272-x.","journal-title":"Computational Visual Media"},{"key":"5281_CR30","doi-asserted-by":"publisher","first-page":"120683","DOI":"10.1016\/j.eswa.2023.120683","volume":"232","author":"L Hu","year":"2023","unstructured":"Hu L, Liu S, Feng W. Skeleton-based action recognition with local dynamic spatial\u2013Temporal aggregation. Expert Systems with Applications, 2023, 232: 120683. DOI: https:\/\/doi.org\/10.1016\/j.eswa.2023.120683.","journal-title":"Expert Systems with Applications"},{"issue":"5","key":"5281_CR31","doi-asserted-by":"publisher","first-page":"185325","DOI":"10.1007\/s11704-023-2418-0","volume":"18","author":"J Li","year":"2024","unstructured":"Li J, Han R, Feng W et al. Contactless interaction recognition and interactor detection in multi-person scenes. Frontiers of Computer Science, 2024, 18(5): 185325. DOI: https:\/\/doi.org\/10.1007\/s11704-023-2418-0.","journal-title":"Frontiers of Computer Science"},{"key":"5281_CR32","unstructured":"Wang L, Yang H, Wu W, Yao H, Huang H. Temporal action proposal generation with transformers. arXiv: 2105.12043, 2021. https:\/\/arxiv.org\/abs\/2105.12043, Jun. 2025."},{"key":"5281_CR33","doi-asserted-by":"publisher","first-page":"13506","DOI":"10.1109\/iccv48922.2021.01327","volume-title":"Proc. the 2021 IEEE\/CVF International Conference on Computer Vision","author":"J Tan","year":"2021","unstructured":"Tan J, Tang J, Wang L, Wu G. Relaxed transformer decoders for direct action proposal generation. In Proc. the 2021 IEEE\/CVF International Conference on Computer Vision, Oct. 2021, pp.13506\u201313515. DOI: https:\/\/doi.org\/10.1109\/iccv48922.2021.01327."},{"key":"5281_CR34","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1145\/3552458.3556443","volume-title":"Proc. the 3rd International Workshop on Human-Centric Multimedia Analysis","author":"S Chang","year":"2022","unstructured":"Chang S, Wang P, Wang F, Li H, Shou Z. Augmented transformer with adaptive graph for temporal action proposal generation. In Proc. the 3rd International Workshop on Human-Centric Multimedia Analysis, Oct. 2022, pp.41\u201350. DOI: https:\/\/doi.org\/10.1145\/3552458.3556443."},{"key":"5281_CR35","doi-asserted-by":"publisher","first-page":"988","DOI":"10.1145\/3123266.3123343","volume-title":"Proc. the 25th ACM International Conference on Multimedia","author":"T Lin","year":"2017","unstructured":"Lin T, Zhao X, Shou Z. Single shot temporal action detection. In Proc. the 25th ACM International Conference on Multimedia, Oct. 2017, pp.988\u2013996. DOI: https:\/\/doi.org\/10.1145\/3123266.3123343."},{"key":"5281_CR36","doi-asserted-by":"publisher","first-page":"3648","DOI":"10.1109\/iccv.2017.392","volume-title":"Proc. the 2017 IEEE International Conference on Computer Vision","author":"J Gao","year":"2017","unstructured":"Gao J, Yang Z, Chen K, Sun C, Nevatia R. TURN TAP: Temporal unit regression network for temporal action proposals. In Proc. the 2017 IEEE International Conference on Computer Vision, Oct. 2017, pp.3648\u20133656. DOI: https:\/\/doi.org\/10.1109\/iccv.2017.392."},{"key":"5281_CR37","doi-asserted-by":"publisher","first-page":"7477","DOI":"10.1609\/aaai.v32i1.12234","volume-title":"Proc. the 32nd AAAI Conference on Artificial Intelligence","author":"K Yang","year":"2018","unstructured":"Yang K, Qiao P, Li D, Lv S, Dou Y. Exploring temporal preservation networks for precise temporal action localization. In Proc. the 32nd AAAI Conference on Artificial Intelligence, Feb. 2018, pp.7477\u20137484. DOI: https:\/\/doi.org\/10.1609\/aaai.v32i1.12234."},{"key":"5281_CR38","unstructured":"Wang C, Cai H, Zou Y, Xiong Y. RGB stream is enough for temporal action detection. arXiv: 2107.04362, 2021. https:\/\/arxiv.org\/abs\/2107.04362, Jun. 2025."},{"key":"5281_CR39","doi-asserted-by":"publisher","first-page":"344","DOI":"10.1109\/cvpr.2019.00043","volume-title":"Proc. the 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"F Long","year":"2019","unstructured":"Long F, Yao T, Qiu Z, Tian X, Luo J, Mei T. Gaussian temporal awareness networks for action localization. In Proc. the 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Jun. 2019, pp.344\u2013353. DOI: https:\/\/doi.org\/10.1109\/cvpr.2019.00043."},{"key":"5281_CR40","doi-asserted-by":"publisher","first-page":"1288","DOI":"10.1109\/icme.2019.00224","volume-title":"Proc. the 2019 IEEE International Conference on Multimedia and Expo","author":"Y Huang","year":"2019","unstructured":"Huang Y, Dai Q, Lu Y. Decoupling localization and classification in single shot temporal action detection. In Proc. the 2019 IEEE International Conference on Multimedia and Expo, Jul. 2019, pp.1288\u20131293. DOI: https:\/\/doi.org\/10.1109\/icme.2019.00224."},{"issue":"4","key":"5281_CR41","doi-asserted-by":"publisher","first-page":"2171","DOI":"10.1109\/TPAMI.2023.3330794","volume":"46","author":"B Wang","year":"2024","unstructured":"Wang B, Zhao Y, Yang L, Long T, Li X. Temporal action localization in the deep learning era: A survey. IEEE Trans. Pattern Analysis and Machine Intelligence, 2024, 46(4): 2171\u20132190. DOI: https:\/\/doi.org\/10.1109\/TPAMI.2023.3330794.","journal-title":"IEEE Trans. Pattern Analysis and Machine Intelligence"},{"key":"5281_CR42","doi-asserted-by":"publisher","first-page":"3319","DOI":"10.1109\/cvpr46437.2021.00333","volume-title":"Proc. the 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"C Lin","year":"2021","unstructured":"Lin C, Xu C, Luo D, Wang Y, Tai Y, Wang C, Li J, Huang F, Fu Y. Learning salient boundary feature for anchor-free temporal action localization. In Proc. the 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Jun. 2021, pp.3319\u20133328. DOI: https:\/\/doi.org\/10.1109\/cvpr46437.2021.00333."},{"key":"5281_CR43","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1007\/978-3-031-19772-7_29","volume-title":"Proc. the 17th European Conference on Computer Vision","author":"C L Zhang","year":"2022","unstructured":"Zhang C L, Wu J, Li Y. ActionFormer: Localizing moments of actions with transformers. In Proc. the 17th European Conference on Computer Vision, Oct. 2022, pp.492\u2013510. DOI: https:\/\/doi.org\/10.1007\/978-3-031-19772-7_29."},{"key":"5281_CR44","doi-asserted-by":"publisher","first-page":"503","DOI":"10.1007\/978-3-031-19830-4_29","volume-title":"Proc. the 17th European Conference on Computer Vision","author":"F Cheng","year":"2022","unstructured":"Cheng F, Bertasius G. TALLFormer: Temporal action localization with a long-memory transformer. In Proc. the 17th European Conference on Computer Vision, Oct. 2022, pp.503\u2013521. DOI: https:\/\/doi.org\/10.1007\/978-3-031-19830-4_29."},{"key":"5281_CR45","doi-asserted-by":"publisher","first-page":"10354","DOI":"10.1109\/iccv51070.2023.00953","volume-title":"Proc. the 2023 IEEE\/CVF International Conference on Computer Vision","author":"L Chen","year":"2023","unstructured":"Chen L, Tong Z, Song Y, Wu G, Wang L. Efficient video action detection with token dropout and context refinement. In Proc. the 2023 IEEE\/CVF International Conference on Computer Vision, Oct. 2023, pp.10354\u201310365. DOI: https:\/\/doi.org\/10.1109\/iccv51070.2023.00953."},{"issue":"9","key":"5281_CR46","doi-asserted-by":"publisher","first-page":"2251","DOI":"10.1109\/TPAMI.2018.2857768","volume":"41","author":"Y Xian","year":"2019","unstructured":"Xian Y, Lampert C H, Schiele B, Akata Z. Zero-shot learning\u2014A comprehensive evaluation of the good, the bad and the ugly. IEEE Trans. Pattern Analysis and Machine Intelligence, 2019, 41(9): 2251\u20132265. DOI: https:\/\/doi.org\/10.1109\/TPAMI.2018.2857768.","journal-title":"IEEE Trans. Pattern Analysis and Machine Intelligence"},{"key":"5281_CR47","doi-asserted-by":"publisher","first-page":"107816","DOI":"10.1109\/ACCESS.2019.2925383","volume":"7","author":"F Wang","year":"2019","unstructured":"Wang F, Liu J, Zhang S, Zhang G, Li Y, Yuan F. Inductive zero-shot image annotation via embedding graph. IEEE Access, 2019, 7: 107816\u2013107830. DOI: https:\/\/doi.org\/10.1109\/ACCESS.2019.2925383.","journal-title":"IEEE Access"},{"key":"5281_CR48","doi-asserted-by":"publisher","first-page":"6288","DOI":"10.1109\/cvpr.2017.666","volume-title":"Proc. the 2017 IEEE Conference on Computer Vision and Pattern Recognition","author":"M Elhoseiny","year":"2017","unstructured":"Elhoseiny M, Zhu Y, Zhang H, Elgammal A. Link the head to the \u201cbeak\u201d: Zero shot learning from noisy text description at part precision. In Proc. the 2017 IEEE Conference on Computer Vision and Pattern Recognition, Jul. 2017, pp.6288\u20136297. DOI: https:\/\/doi.org\/10.1109\/cvpr.2017.666."},{"key":"5281_CR49","doi-asserted-by":"publisher","first-page":"1004","DOI":"10.1109\/CVPR.2018.00111","volume-title":"Proc. the 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Zhu","year":"2018","unstructured":"Zhu Y, Elhoseiny M, Liu B, Peng X, Elgammal A. A generative adversarial approach for zero-shot learning from noisy texts. In Proc. the 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Jun. 2018, pp.1004\u20131013. DOI: https:\/\/doi.org\/10.1109\/cvpr.2018.00111."},{"key":"5281_CR50","doi-asserted-by":"publisher","first-page":"514","DOI":"10.1007\/978-3-030-69544-6_31","volume-title":"Proc. the 15th Asian Conference on Computer Vision","author":"Y Le Cacheux","year":"2020","unstructured":"Le Cacheux Y, Popescu A, Le Borgne H. Webly supervised semantic embeddings for large scale zero-shot learning. In Proc. the 15th Asian Conference on Computer Vision, Nov. 2020, pp.514\u2013531. DOI: https:\/\/doi.org\/10.1007\/978-3-030-69544-6_31."},{"key":"5281_CR51","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1109\/cvpr.2016.14","volume-title":"Proc. the 2016 IEEE Conference on Computer Vision and Pattern Recognition","author":"Z Akata","year":"2016","unstructured":"Akata Z, Malinowski M, Fritz M, Schiele B. Multi-cue zero-shot learning with strong supervision. In Proc. the 2016 IEEE Conference on Computer Vision and Pattern Recognition, Jun. 2016, pp.59\u201368. DOI: https:\/\/doi.org\/10.1109\/cvpr.2016.14."},{"key":"5281_CR52","unstructured":"Goldberg Y, Levy O. word2vec explained: Deriving Mikolov et al.\u2019s negative-sampling word-embedding method. arXiv: 1402.3722, 2014. https:\/\/arxiv.org\/abs\/1402.3722, Jun. 2025."},{"key":"5281_CR53","doi-asserted-by":"publisher","first-page":"1532","DOI":"10.3115\/v1\/d14-1162","volume-title":"Proc. the 2014 Conference on Empirical Methods in Natural Language Processing","author":"J Pennington","year":"2014","unstructured":"Pennington J, Socher R, Manning C D. GloVe: Global vectors for word representation. In Proc. the 2014 Conference on Empirical Methods in Natural Language Processing, Oct. 2014, pp.1532\u20131543. DOI: https:\/\/doi.org\/10.3115\/v1\/d14-1162."},{"key":"5281_CR54","doi-asserted-by":"publisher","DOI":"10.1109\/fg59268.2024.10581896","volume-title":"Proc. the 18th IEEE International Conference on Automatic Face and Gesture Recognition (FG)","author":"T T Nguyen","year":"2024","unstructured":"Nguyen T T, Kawanishi Y, Komamizu T, Ide I. One-stage open-vocabulary temporal action detection leveraging temporal multi-scale and action label features. In Proc. the 18th IEEE International Conference on Automatic Face and Gesture Recognition (FG), May 2024. DOI: https:\/\/doi.org\/10.1109\/fg59268.2024.10581896."},{"key":"5281_CR55","doi-asserted-by":"publisher","first-page":"7175","DOI":"10.1145\/3581783.3611774","volume-title":"Proc. the 31st ACM International Conference on Multimedia","author":"J Zhang","year":"2023","unstructured":"Zhang J, Lin L, Liu J. Prompted contrast with masked motion modeling: Towards versatile 3D action representation learning. In Proc. the 31st ACM International Conference on Multimedia, Oct. 2023, pp.7175\u20137183. DOI: https:\/\/doi.org\/10.1145\/3581783.3611774."},{"key":"5281_CR56","doi-asserted-by":"publisher","DOI":"10.5555\/3666122.3669399","volume-title":"Proc. the 37th International Conference on Neural Information Processing Systems","author":"K Ranasinghe","year":"2023","unstructured":"Ranasinghe K, Ryoo M. Language-based action concept spaces improve video self-supervised learning. In Proc. the 37th International Conference on Neural Information Processing Systems, Dec. 2023, Article No. 3277. DOI: https:\/\/doi.org\/10.5555\/3666122.3669399."},{"key":"5281_CR57","doi-asserted-by":"publisher","first-page":"6000","DOI":"10.5555\/3295222.3295349","volume-title":"Proc. the 31st International Conference on Neural Information Processing Systems","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A N, Kaiser \u0141, Polosukhin I. Attention is all you need. In Proc. the 31st International Conference on Neural Information Processing Systems, Dec. 2017, pp.6000\u20136010. DOI: https:\/\/doi.org\/10.5555\/3295222.3295349."},{"key":"5281_CR58","unstructured":"van den Oord A, Li Y, Vinyals O. Representation learning with contrastive predictive coding. arXiv: 1807.03748, 2018. https:\/\/arxiv.org\/abs\/1807.03748, Jun. 2025."},{"key":"5281_CR59","doi-asserted-by":"publisher","first-page":"5562","DOI":"10.1109\/iccv.2017.593","volume-title":"Proc. the 2017 IEEE International Conference on Computer Vision","author":"N Bodla","year":"2017","unstructured":"Bodla N, Singh B, Chellappa R, Davis L S. Soft-NMS \u2014 Improving object detection with one line of code. In Proc. the 2017 IEEE International Conference on Computer Vision, Oct. 2017, pp.5562\u20135570. DOI: https:\/\/doi.org\/10.1109\/iccv.2017.593."},{"key":"5281_CR60","unstructured":"Tang T N, Kim K, Sohn K. TemporalMaxer: Maximize temporal context with only max pooling for temporal action localization. arXiv: 2303.09055, 2023. https:\/\/arxiv.org\/abs\/2303.09055, Jun. 2025."},{"key":"5281_CR61","doi-asserted-by":"publisher","first-page":"18857","DOI":"10.1109\/cvpr52729.2023.01808","volume-title":"Proc. the 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"D Shi","year":"2023","unstructured":"Shi D, Zhong Y, Cao Q, Ma L, Lit J, Tao D. TriDet: Temporal action detection with relative boundary modeling. In Proc. the 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Jun. 2023, pp.18857\u201318866. DOI: https:\/\/doi.org\/10.1109\/cvpr52729.2023.01808."},{"key":"5281_CR62","unstructured":"Achiam J, Adler S, Agarwal S et al. GPT-4 technical report. arXiv: 2303.08774, 2023. https:\/\/arxiv.org\/abs\/2303.08774, Jun. 2025."},{"key":"5281_CR63","doi-asserted-by":"publisher","first-page":"4724","DOI":"10.1109\/cvpr.2017.502","volume-title":"Proc. the 2017 IEEE Conference on Computer Vision and Pattern Recognition","author":"J Carreira","year":"2017","unstructured":"Carreira J, Zisserman A. Quo Vadis, action recognition? A new model and the Kinetics dataset. In Proc. the 2017 IEEE Conference on Computer Vision and Pattern Recognition, Jul. 2017, pp.4724\u20134733. DOI: https:\/\/doi.org\/10.1109\/cvpr.2017.502."},{"key":"5281_CR64","unstructured":"Soomro K, Zamir A R, Shah M. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv: 1212.0402, 2012. https:\/\/arxiv.org\/abs\/1212.0402, Jun. 2025."},{"issue":"2","key":"5281_CR65","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1109\/TPAMI.2019.2901464","volume":"42","author":"M Monfort","year":"2020","unstructured":"Monfort M, Andonian A, Zhou B, Ramakrishnan K, Bargal S A, Yan T, Brown L, Fan Q, Gutfreund D, Vondrick C, Oliva A. Moments in time dataset: One million videos for event understanding. IEEE Trans. Pattern Analysis and Machine Intelligence, 2020, 42(2): 502\u2013508. DOI: https:\/\/doi.org\/10.1109\/TPAMI.2019.2901464.","journal-title":"IEEE Trans. Pattern Analysis and Machine Intelligence"}],"container-title":["Journal of Computer Science and Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-025-5281-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11390-025-5281-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11390-025-5281-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T06:03:11Z","timestamp":1763704991000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11390-025-5281-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":65,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,9]]}},"alternative-id":["5281"],"URL":"https:\/\/doi.org\/10.1007\/s11390-025-5281-7","relation":{},"ISSN":["1000-9000","1860-4749"],"issn-type":[{"type":"print","value":"1000-9000"},{"type":"electronic","value":"1860-4749"}],"subject":[],"published":{"date-parts":[[2025,9]]},"assertion":[{"value":"13 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Conflict of Interest\n                      The authors declare that they have no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics"}}]}}