{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T15:41:08Z","timestamp":1760370068605,"version":"3.37.3"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"20","license":[{"start":{"date-parts":[[2024,7,18]],"date-time":"2024-07-18T00:00:00Z","timestamp":1721260800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,7,18]],"date-time":"2024-07-18T00:00:00Z","timestamp":1721260800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61572251","61572162","61702144"],"award-info":[{"award-number":["61572251","61572162","61702144"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the Zhejiang Provincial Key Science and Technolog \u201cLingYan\u201d Project Foundation","award":["2023C01145"],"award-info":[{"award-number":["2023C01145"]}]},{"name":"the Key Science and Technology Project Foundation of Zhejiang Province","award":["2018C01012"],"award-info":[{"award-number":["2018C01012"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Soft Comput"],"published-print":{"date-parts":[[2024,10]]},"DOI":"10.1007\/s00500-024-09955-x","type":"journal-article","created":{"date-parts":[[2024,7,18]],"date-time":"2024-07-18T14:02:31Z","timestamp":1721311351000},"page":"12377-12388","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-granularity transformer fusion for temporal action localization"],"prefix":"10.1007","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6059-3798","authenticated-orcid":false,"given":"Min","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Haiyang","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Zhongjin","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,7,18]]},"reference":[{"key":"9955_CR1","doi-asserted-by":"crossref","unstructured":"Bai Y, Wang Y, Tong Y, Yang Y, Liu Q, Liu J (2020) Boundary content graph neural network for temporal action proposal generation. In: European conference on computer vision (ECCV), pp 121\u2013137","DOI":"10.1007\/978-3-030-58604-1_8"},{"key":"9955_CR2","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. In: European conference on computer vision (ECCV), pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"9955_CR3","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo Vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"9955_CR4","doi-asserted-by":"crossref","unstructured":"Chao YW, Vijayanarasimhan S, Seybold B, Ross DA, Deng J, Sukthankar R (2018) Rethinking the faster R-CNN architecture for temporal action localization. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 1130\u20131139","DOI":"10.1109\/CVPR.2018.00124"},{"key":"9955_CR5","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Houlsby N (2020) An image is worth $$16\\times 16$$ words: transformers for image recognition at scale. Preprint arXiv:2010.11929"},{"key":"9955_CR6","doi-asserted-by":"crossref","unstructured":"Gao J, Shi Z, Wang G, Li J, Yuan Y, Ge S, Zhou X (2020) Accurate temporal action proposal generation with relation-aware pyramid network. In: Proceedings of the AAAI conference on artificial intelligence, pp 10810\u201310817","DOI":"10.1609\/aaai.v34i07.6711"},{"key":"9955_CR7","doi-asserted-by":"crossref","unstructured":"Heilbron FC, Escorcia V, Ghanem B, Niebles JC (2015) Activitynet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 961\u2013970","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"9955_CR8","doi-asserted-by":"crossref","unstructured":"Hu M, Li Y, Fang L, Wang S (2021) A2-FPN: attention aggregation based feature pyramid network for instance segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 15343\u201315352","DOI":"10.1109\/CVPR46437.2021.01509"},{"key":"9955_CR9","doi-asserted-by":"crossref","unstructured":"Jain J, Li J, Chiu M, Hassani A, Orlov N, Shi H (2022) OneFormer: one transformer to rule universal image segmentation. Preprint arXiv:2211.06220","DOI":"10.1109\/CVPR52729.2023.00292"},{"key":"9955_CR10","unstructured":"Jiang Y-G, Liu J, Roshan Zamir A, Toderici G, Laptev I, Shah M, Sukthankar R (2014) THUMOS challenge: action recognition with a large number of classes. http:\/\/crcv.ucf.edu\/THUMOS14\/"},{"key":"9955_CR11","doi-asserted-by":"crossref","unstructured":"Kang TK, Lee GH, Lee SW (2022) HTNet: anchor-free temporal action localization with hierarchical transformers. In: IEEE international conference on systems, man, and cybernetics (SMC), pp 365\u2013370","DOI":"10.1109\/SMC53654.2022.9945289"},{"key":"9955_CR12","doi-asserted-by":"crossref","unstructured":"Lin T, Zhao X, Shou Z (2017a) Single shot temporal action detection. In: Proceedings of the 25th ACM international conference on multimedia, pp 988\u2013996","DOI":"10.1145\/3123266.3123343"},{"key":"9955_CR13","doi-asserted-by":"crossref","unstructured":"Lin TY, Doll\u00e3r P, Girshick R, He K, Hariharan B, Belongie S (2017b) Feature pyramid networks for object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 2117\u20132125","DOI":"10.1109\/CVPR.2017.106"},{"key":"9955_CR14","doi-asserted-by":"crossref","unstructured":"Lin T, Zhao X, Su H, Wang C, Yang M (2018) BSN: boundary sensitive network for temporal action proposal generation. In: European conference on computer vision (ECCV), pp 3\u201319","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"9955_CR15","doi-asserted-by":"crossref","unstructured":"Lin T, Liu X, Li X, Ding E, Wen S (2019) BMN: boundary-matching network for temporal action proposal generation. In: Proceedings of the IEEE international conference on computer vision (ICCV), pp 3889\u20133898","DOI":"10.1109\/ICCV.2019.00399"},{"key":"9955_CR16","doi-asserted-by":"crossref","unstructured":"Lin C, Li J, Wang Y, Tai Y, Luo D, Cui Z, Ji R (2020) Fast learning of temporal action proposal via dense boundary generator. In: Proceedings of the AAAI conference on artificial intelligence, pp 11499\u201311506","DOI":"10.1609\/aaai.v34i07.6815"},{"key":"9955_CR17","doi-asserted-by":"crossref","unstructured":"Lin C, Xu C, Luo D, Wang Y, Tai Y, Wang C, Fu Y (2021) Learning salient boundary feature for anchor-free temporal action localization. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 3320\u20133329","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"9955_CR18","doi-asserted-by":"crossref","unstructured":"Lin K, Li L, Lin CC, Ahmed F, Gan Z, Liu Z, Wang L (2022) Swinbert: end-to-end transformers with sparse attention for video captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 17949\u201317958","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"9955_CR19","doi-asserted-by":"crossref","unstructured":"Liu Q, Wang Z (2020) Progressive boundary refinement network for temporal action detection. In: Proceedings of the AAAI conference on artificial intelligence (AAAI), pp 11612\u201311619","DOI":"10.1609\/aaai.v34i07.6829"},{"key":"9955_CR20","doi-asserted-by":"crossref","unstructured":"Liu Y, Ma L, Zhang Y, Liu W, Chang SF (2019) Multi-granularity generator for temporal action proposal. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 3604\u20133613","DOI":"10.1109\/CVPR.2019.00372"},{"issue":"1","key":"9955_CR21","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1109\/TCSVT.2021.3075607","volume":"32","author":"Y Liu","year":"2021","unstructured":"Liu Y, Chen J, Chen X, Deng B, Huang J, Hua XS (2021) Centerness-aware network for temporal action proposal. IEEE Trans Circuits Syst Video Technol 32(1):5\u201316","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"9955_CR22","doi-asserted-by":"publisher","first-page":"5427","DOI":"10.1109\/TIP.2022.3195321","volume":"31","author":"X Liu","year":"2022","unstructured":"Liu X, Wang Q, Hu Y, Tang X, Zhang S, Bai S, Bai X (2022) End-to-end temporal action detection with transformer. IEEE Trans Image Process 31:5427\u20135441","journal-title":"IEEE Trans Image Process"},{"key":"9955_CR23","unstructured":"Ouyang Y, Zhang T, Gu W, Wang H, Wang L, Guo X (2022) Adaptive perception transformer for temporal action localization. Preprint arXiv:2208.11908"},{"key":"9955_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2022.104589","volume":"129","author":"J Shang","year":"2023","unstructured":"Shang J, Wei P, Li H, Zheng N (2023) Multi-scale interaction transformer for temporal action proposal generation. Image Vis Comput 129:104589","journal-title":"Image Vis Comput"},{"key":"9955_CR25","doi-asserted-by":"crossref","unstructured":"Shi D, Zhong Y, Cao Q, Zhang J, Ma L, Li J, Tao D (2022) React: temporal action detection with relational queries. In: European conference on computer vision (ECCV), pp 105\u2013121","DOI":"10.1007\/978-3-031-20080-9_7"},{"key":"9955_CR26","doi-asserted-by":"crossref","unstructured":"Shou Z, Wang D, Chang SF (2016) Temporal action localization in untrimmed videos via multi-stage CNNs. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 1049\u20131058","DOI":"10.1109\/CVPR.2016.119"},{"key":"9955_CR27","doi-asserted-by":"crossref","unstructured":"Su H, Gan W, Wu W, Qiao Y, Yan J (2021) Bsn++: complementary boundary regressor with scale-balanced relation modeling for temporal action proposal generation. In: Proceedings of the AAAI conference on artificial intelligence (AAAI), pp 2602\u20132610","DOI":"10.1609\/aaai.v35i3.16363"},{"key":"9955_CR28","doi-asserted-by":"crossref","unstructured":"Tan J, Tang J, Wang L, Wu G (2021) Relaxed transformer decoders for direct action proposal generation. In: Proceedings of the IEEE international conference on computer vision (ICCV), pp 13526\u201313535","DOI":"10.1109\/ICCV48922.2021.01327"},{"key":"9955_CR29","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing systems, pp 5998\u20136008"},{"key":"9955_CR30","doi-asserted-by":"crossref","unstructured":"Xu H, Das A, Saenko K (2017) R-C3D: region convolutional 3D network for temporal activity detection. In: Proceedings of the IEEE international conference on computer vision (ICCV), pp 5783\u20135792","DOI":"10.1109\/ICCV.2017.617"},{"key":"9955_CR31","doi-asserted-by":"crossref","unstructured":"Xu M, Zhao C, Rojas DS, Thabet A, Ghanem B (2020) G-tad: sub-graph localization for temporal action detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 10156\u201310165","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"9955_CR32","doi-asserted-by":"crossref","unstructured":"Yan S, Xiong X, Arnab A, Lu Z, Zhang M, Sun C, Schmid C (2022) Multiview transformers for video recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 3333\u20133343","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"9955_CR33","doi-asserted-by":"publisher","first-page":"8535","DOI":"10.1109\/TIP.2020.3016486","volume":"29","author":"L Yang","year":"2020","unstructured":"Yang L, Peng H, Zhang D, Fu J, Han J (2020) Revisiting anchor mechanisms for temporal action localization. IEEE Trans Image Process 29:8535\u20138548","journal-title":"IEEE Trans Image Process"},{"issue":"12","key":"9955_CR34","doi-asserted-by":"publisher","first-page":"9814","DOI":"10.1109\/TPAMI.2021.3132058","volume":"44","author":"L Yang","year":"2021","unstructured":"Yang L, Han J, Zhao T, Lin T, Zhang D, Chen J (2021) Background-click supervision for temporal action localization. IEEE Trans Pattern Anal Mach Intell 44(12):9814\u20139829","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9955_CR35","doi-asserted-by":"crossref","unstructured":"Yuan J, Ni B, Yang X, Kassim AA (2016) Temporal action localization with pyramid of score distribution features. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), pp 3093\u20133102","DOI":"10.1109\/CVPR.2016.337"},{"key":"9955_CR36","doi-asserted-by":"crossref","unstructured":"Zeng R, Huang W, Tan M, Rong Y, Zhao P, Huang J, Gan C (2019) Graph convolutional networks for temporal action localization. In: Proceedings of the IEEE international conference on computer vision (ICCV), pp 7094\u20137103","DOI":"10.1109\/ICCV.2019.00719"},{"key":"9955_CR37","doi-asserted-by":"crossref","unstructured":"Zhang CL, Wu J, Li Y (2022) Actionformer: localizing moments of actions with transformers. In: European conference on computer vision (ECCV), pp 492\u2013510","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"9955_CR38","doi-asserted-by":"crossref","unstructured":"Zhao Y, Xiong Y, Wang L, Wu Z, Tang X, Lin D (2017) Temporal action detection with structured segment networks. In: Proceedings of the IEEE international conference on computer vision (ICCV), pp 2914\u20132923","DOI":"10.1109\/ICCV.2017.317"},{"key":"9955_CR39","unstructured":"Zhu X, Su W, Lu L, Li B, Wang X, Dai J (2020) Deformable DETR: deformable transformers for end-to-end object detection. Preprint arXiv:2010.04159"},{"key":"9955_CR40","doi-asserted-by":"crossref","unstructured":"Zhu L, Lee F, Cai J, Yu H, Chen Q (2022) An improved feature pyramid network for object detection. Neurocomputing 483:127\u2013139","DOI":"10.1016\/j.neucom.2022.02.016"}],"container-title":["Soft Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00500-024-09955-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00500-024-09955-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00500-024-09955-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T01:04:13Z","timestamp":1729645453000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00500-024-09955-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,18]]},"references-count":40,"journal-issue":{"issue":"20","published-print":{"date-parts":[[2024,10]]}},"alternative-id":["9955"],"URL":"https:\/\/doi.org\/10.1007\/s00500-024-09955-x","relation":{},"ISSN":["1432-7643","1433-7479"],"issn-type":[{"type":"print","value":"1432-7643"},{"type":"electronic","value":"1433-7479"}],"subject":[],"published":{"date-parts":[[2024,7,18]]},"assertion":[{"value":"11 May 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 July 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This research does not contain any studies with human participants performed by any of the authors.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}