{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:17:38Z","timestamp":1775578658674,"version":"3.50.1"},"publisher-location":"Cham","reference-count":83,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031197710","type":"print"},{"value":"9783031197727","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19772-7_29","type":"book-chapter","created":{"date-parts":[[2022,10,27]],"date-time":"2022-10-27T22:09:58Z","timestamp":1666908598000},"page":"492-510","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":321,"title":["ActionFormer: Localizing Moments of\u00a0Actions with\u00a0Transformers"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3168-1852","authenticated-orcid":false,"given":"Chen-Lin","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2085-7568","authenticated-orcid":false,"given":"Jianxin","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4173-9453","authenticated-orcid":false,"given":"Yin","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,28]]},"reference":[{"key":"29_CR1","doi-asserted-by":"crossref","unstructured":"Alwassel, H., Giancola, S., Ghanem, B.: TSP: Temporally-sensitive pretraining of video encoders for localization tasks. In: International Conference on Computer Vision Workshops, pp. 1\u201311 (2021)","DOI":"10.1109\/ICCVW54120.2021.00356"},{"key":"29_CR2","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViViT: a video vision transformer. In: International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"29_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58604-1_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Bai","year":"2020","unstructured":"Bai, Y., Wang, Y., Tong, Y., Yang, Y., Liu, Q., Liu, J.: Boundary content graph neural network for temporal action proposal generation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12373, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58604-1_8"},{"key":"29_CR4","unstructured":"Beltagy, I., Peters, M.E., Cohan, A.: LongFormer: the long-document transformer. arXiv:2004.05150 (2020)"},{"key":"29_CR5","doi-asserted-by":"crossref","unstructured":"Bodla, N., Singh, B., Chellappa, R., Davis, L.S.: Soft-NMS-improving object detection with one line of code. In: International Conference on Computer Vision, pp. 5561\u20135569 (2017)","DOI":"10.1109\/ICCV.2017.593"},{"key":"29_CR6","doi-asserted-by":"crossref","unstructured":"Buch, S., Escorcia, V., Ghanem, B., Niebles Carlos, J.: End-to-end, single-stream temporal action detection in untrimmed videos. In: British Machine Vision Conference, pp. 93.1\u201393.12 (2017)","DOI":"10.5244\/C.31.93"},{"key":"29_CR7","doi-asserted-by":"crossref","unstructured":"Buch, S., Escorcia, V., Shen, C., Ghanem, B., Carlos Niebles, J.: SST: Single-stream temporal action proposals. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 2911\u20132920 (2017)","DOI":"10.1109\/CVPR.2017.675"},{"key":"29_CR8","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Carlos Niebles, J., Ghanem, B.: Fast temporal activity proposals for efficient detection of human actions in untrimmed videos. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1914\u20131923 (2016)","DOI":"10.1109\/CVPR.2016.211"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., Ghanem, B., Carlos Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"29_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"29_CR11","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the Kinetics dataset. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 4724\u20134733 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"Chang, S., Wang, P., Wang, F., Li, H., Feng, J.: Augmented transformer with adaptive graph for temporal action proposal generation. arXiv preprint arXiv:2103.16024 (2021)","DOI":"10.1145\/3552458.3556443"},{"key":"29_CR13","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Vijayanarasimhan, S., Seybold, B., Ross, D.A., Deng, J., Sukthankar, R.: Rethinking the Faster-RCNN architecture for temporal action localization. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1130\u20131139 (2018)","DOI":"10.1109\/CVPR.2018.00124"},{"key":"29_CR14","unstructured":"Cheng, B., Schwing, A.G., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. In: Advances in Neural Information Processing Systems (2021)"},{"key":"29_CR15","unstructured":"Choromanski, K., et al.: Rethinking attention with performers. In: International Conference on Learning Representations (2021)"},{"key":"29_CR16","doi-asserted-by":"crossref","unstructured":"Dai, X., Chen, Y., Yang, J., Zhang, P., Yuan, L., Zhang, L.: Dynamic DETR: end-to-end object detection with dynamic attention. In: International Conference on Computer Vision, pp. 2988\u20132997 (2021)","DOI":"10.1109\/ICCV48922.2021.00298"},{"key":"29_CR17","unstructured":"Damen, D., et al.: Rescaling egocentric vision. arXiv preprint arXiv:2006.13256 (2020)"},{"key":"29_CR18","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: North American Chapter of the Association for Computational Linguistics, pp. 4171\u20134186 (2019)"},{"key":"29_CR19","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"29_CR20","doi-asserted-by":"crossref","unstructured":"Duan, K., Bai, S., Xie, L., Qi, H., Huang, Q., Tian, Q.: CenterNet: keypoint triplets for object detection. In: International Conference on Computer Vision, pp. 6569\u20136578 (2019)","DOI":"10.1109\/ICCV.2019.00667"},{"key":"29_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"768","DOI":"10.1007\/978-3-319-46487-9_47","volume-title":"Computer Vision \u2013 ECCV 2016","author":"V Escorcia","year":"2016","unstructured":"Escorcia, V., Caba Heilbron, F., Niebles, J.C., Ghanem, B.: DAPs: deep action proposals for action understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 768\u2013784. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_47"},{"key":"29_CR22","doi-asserted-by":"crossref","unstructured":"Fan, H., Xiong, B., Mangalam, K., Li, Y., Yan, Z., Malik, J., Feichtenhofer, C.: Multiscale vision transformers. In: International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"29_CR23","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast networks for video recognition. In: International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"29_CR24","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Carreira, J., Doersch, C., Zisserman, A.: Video action transformer network. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 244\u2013253 (2019)","DOI":"10.1109\/CVPR.2019.00033"},{"key":"29_CR25","doi-asserted-by":"crossref","unstructured":"Gong, G., Zheng, L., Mu, Y.: Scale matters: temporal scale aggregation network for precise action localization in untrimmed videos. In: International Conference on Multimedia and Expo, pp. 1\u20136. IEEE (2020)","DOI":"10.1109\/ICME46284.2020.9102850"},{"key":"29_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.cviu.2016.10.018","volume":"155","author":"H Idrees","year":"2017","unstructured":"Idrees, H., et al.: The THUMOS challenge on action recognition for videos \u201cin the wild\u2019\u2019. Comput. Vis. Image Under. 155, 1\u201323 (2017)","journal-title":"Comput. Vis. Image Under."},{"key":"29_CR27","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: International Conference on Learning Representations, pp. 1\u201311 (2015)"},{"key":"29_CR28","unstructured":"Li, S., et al.: Enhancing the locality and breaking the memory bottleneck of transformer on time series forecasting. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"29_CR29","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Deep concept-wise temporal convolutional networks for action localization. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 4004\u20134012 (2020)","DOI":"10.1145\/3394171.3413860"},{"key":"29_CR30","doi-asserted-by":"crossref","unstructured":"Lin, C., et al.: Fast learning of temporal action proposal via dense boundary generator. In: AAAI, pp. 11499\u201311506 (2020)","DOI":"10.1609\/aaai.v34i07.6815"},{"key":"29_CR31","doi-asserted-by":"crossref","unstructured":"Lin, C., et al.: Learning salient boundary feature for anchor-free temporal action localization. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 3320\u20133329 (2021)","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"29_CR32","doi-asserted-by":"crossref","unstructured":"Lin, T., Liu, X., Li, X., Ding, E., Wen, S.: BMN: boundary-matching network for temporal action proposal generation. In: International Conference on Computer Vision, pp. 3889\u20133898 (2019)","DOI":"10.1109\/ICCV.2019.00399"},{"key":"29_CR33","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Shou, Z.: Single shot temporal action detection. In: ACM International Conference on Multimedia, pp. 988\u2013996 (2017)","DOI":"10.1145\/3123266.3123343"},{"key":"29_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-030-01225-0_1","volume-title":"Computer Vision \u2013 ECCV 2018","author":"T Lin","year":"2018","unstructured":"Lin, T., Zhao, X., Su, H., Wang, C., Yang, M.: BSN: boundary sensitive network for temporal action proposal generation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11208, pp. 3\u201321. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01225-0_1"},{"key":"29_CR35","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"29_CR36","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: IEEE Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"29_CR37","doi-asserted-by":"crossref","unstructured":"Liu, D., Jiang, T., Wang, Y.: Completeness modeling and context separation for weakly supervised temporal action localization. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1298\u20131307 (2019)","DOI":"10.1109\/CVPR.2019.00139"},{"key":"29_CR38","doi-asserted-by":"crossref","unstructured":"Liu, L., Liu, X., Gao, J., Chen, W., Han, J.: Understanding the difficulty of training transformers. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 5747\u20135763 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.463"},{"key":"29_CR39","doi-asserted-by":"crossref","unstructured":"Liu, Q., Wang, Z.: Progressive boundary refinement network for temporal action detection. In: AAAI, vol. 34, pp. 11612\u201311619 (2020)","DOI":"10.1609\/aaai.v34i07.6829"},{"key":"29_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/978-3-319-46448-0_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"W Liu","year":"2016","unstructured":"Liu, W., et al.: SSD: single shot multibox detector. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 21\u201337. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_2"},{"key":"29_CR41","doi-asserted-by":"crossref","unstructured":"Liu, X., Hu, Y., Bai, S., Ding, F., Bai, X., Torr, P.H.: Multi-shot temporal event localization: a benchmark. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 12596\u201312606 (2021)","DOI":"10.1109\/CVPR46437.2021.01241"},{"key":"29_CR42","doi-asserted-by":"crossref","unstructured":"Liu, X., Wang, Q., Hu, Y., Tang, X., Bai, S., Bai, X.: End-to-end temporal action detection with transformer. arXiv preprint arXiv:2106.10271 (2021)","DOI":"10.1109\/TIP.2022.3195321"},{"key":"29_CR43","doi-asserted-by":"crossref","unstructured":"Liu, Y., Ma, L., Zhang, Y., Liu, W., Chang, S.F.: Multi-granularity generator for temporal action proposal. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 3604\u20133613 (2019)","DOI":"10.1109\/CVPR.2019.00372"},{"key":"29_CR44","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: IEEE Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"29_CR45","unstructured":"Liu, Z., et al.: Video swin transformer. arXiv preprint arXiv:2106.13230 (2021)"},{"key":"29_CR46","doi-asserted-by":"crossref","unstructured":"Long, F., Yao, T., Qiu, Z., Tian, X., Luo, J., Mei, T.: Gaussian temporal awareness networks for action localization. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 344\u2013353 (2019)","DOI":"10.1109\/CVPR.2019.00043"},{"key":"29_CR47","unstructured":"Paszke, A., et al.: PyTorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"29_CR48","doi-asserted-by":"crossref","unstructured":"Qing, Z., et al.: Temporal context aggregation network for temporal action proposal refinement. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 485\u2013494 (2021)","DOI":"10.1109\/CVPR46437.2021.00055"},{"key":"29_CR49","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Mei, T.: Learning spatio-temporal representation with pseudo-3D residual networks. In: IEEE Conference on Computer Vision , pp. 5533\u20135541 (2017)","DOI":"10.1109\/ICCV.2017.590"},{"key":"29_CR50","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 658\u2013666 (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"29_CR51","doi-asserted-by":"crossref","unstructured":"Shou, Z., Chan, J., Zareian, A., Miyazawa, K., Chang, S.F.: CDC: convolutional-de-convolutional networks for precise temporal action localization in untrimmed videos. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 5734\u20135743 (2017)","DOI":"10.1109\/CVPR.2017.155"},{"key":"29_CR52","doi-asserted-by":"crossref","unstructured":"Shou, Z., Wang, D., Chang, S.F.: Temporal action localization in untrimmed videos via multi-stage CNNs. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 1049\u20131058 (2016)","DOI":"10.1109\/CVPR.2016.119"},{"key":"29_CR53","doi-asserted-by":"crossref","unstructured":"Sridhar, D., Quader, N., Muralidharan, S., Li, Y., Dai, P., Lu, J.: Class semantics-based attention for action detection. In: IEEE Conference on Computer Vision, pp. 13739\u201313748 (2021)","DOI":"10.1109\/ICCV48922.2021.01348"},{"key":"29_CR54","doi-asserted-by":"crossref","unstructured":"Tan, J., Tang, J., Wang, L., Wu, G.: Relaxed transformer decoders for direct action proposal generation. In: IEEE Conference on Computer Vision, pp. 13526\u201313535 (2021)","DOI":"10.1109\/ICCV48922.2021.01327"},{"key":"29_CR55","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Chen, H., He, T.: FCOS: fully convolutional one-stage object detection. In: IEEE Conference on Computer Vision, pp. 9627\u20139636 (2019)","DOI":"10.1109\/ICCV.2019.00972"},{"key":"29_CR56","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357 (2021)"},{"key":"29_CR57","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., J\u00e9gou, H.: Going deeper with image transformers. In: IEEE Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"29_CR58","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"29_CR59","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"29_CR60","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"29_CR61","unstructured":"Wang, L., Yang, H., Wu, W., Yao, H., Huang, H.: Temporal action proposal generation with transformers. arXiv preprint arXiv:2105.12043 (2021)"},{"key":"29_CR62","unstructured":"Wang, S., Li, B., Khabsa, M., Fang, H., Ma, H.: LinFormer: self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)"},{"key":"29_CR63","doi-asserted-by":"crossref","unstructured":"Wang, T., Yuan, L., Chen, Y., Feng, J., Yan, S.: PnP-DETR: towards efficient visual analysis with transformers. In: IEEE Conference on Computer Vision, pp. 4661\u20134670 (2021)","DOI":"10.1109\/ICCV48922.2021.00462"},{"key":"29_CR64","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. In: IEEE Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"29_CR65","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: End-to-end video instance segmentation with transformers. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 8741\u20138750 (2021)","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"29_CR66","unstructured":"Xiao, T., Singh, M., Mintun, E., Darrell, T., Doll\u00e1r, P., Girshick, R.: Early convolutions help transformers see better. In: Advances in Neural Information Processing Systems (2021)"},{"key":"29_CR67","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: simple and efficient design for semantic segmentation with transformers. In: Advances in Neural Information Processing Systems (2021)"},{"key":"29_CR68","doi-asserted-by":"crossref","unstructured":"Xiong, Y., et al.: Nystr\u00f6mformer: a nystr\u00f6m-based algorithm for approximating self-attention. In: AAAI, vol. 35, pp. 14138\u201314148 (2021)","DOI":"10.1609\/aaai.v35i16.17664"},{"key":"29_CR69","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhao, C., Rojas, D.S., Thabet, A., Ghanem, B.: G-TAD: sub-graph localization for temporal action detection. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 10156\u201310165 (2020)","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"29_CR70","unstructured":"Yang, J., et al.: Focal self-attention for local-global interactions in vision transformers. In: Advances in Neural Information Processing Systems (2021)"},{"key":"29_CR71","doi-asserted-by":"publisher","first-page":"8535","DOI":"10.1109\/TIP.2020.3016486","volume":"29","author":"L Yang","year":"2020","unstructured":"Yang, L., Peng, H., Zhang, D., Fu, J., Han, J.: Revisiting anchor mechanisms for temporal action localization. IEEE Trans. Image Process. 29, 8535\u20138548 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"29_CR72","doi-asserted-by":"crossref","unstructured":"Yang, Z., Qin, J., Huang, D.: AcgNet: action complement graph network for weakly-supervised temporal action localization. In: AAAI, vol. 36\u20133, pp. 3090\u20133098 (2022)","DOI":"10.1609\/aaai.v36i3.20216"},{"key":"29_CR73","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Tokens-to-token ViT: training vision transformers from scratch on ImageNet. In: IEEE Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"29_CR74","doi-asserted-by":"crossref","unstructured":"Zeng, R., et al.: Graph convolutional networks for temporal action localization. In: IEEE Conference on Computer Vision, pp. 7094\u20137103 (2019)","DOI":"10.1109\/ICCV.2019.00719"},{"key":"29_CR75","doi-asserted-by":"crossref","unstructured":"Zeng, R., Xu, H., Huang, W., Chen, P., Tan, M., Gan, C.: Dense regression network for video grounding. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 10287\u201310296 (2020)","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"29_CR76","doi-asserted-by":"crossref","unstructured":"Zhang, S., Chi, C., Yao, Y., Lei, Z., Li, S.Z.: Bridging the gap between anchor-based and anchor-free detection via adaptive training sample selection. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 9759\u20139768 (2020)","DOI":"10.1109\/CVPR42600.2020.00978"},{"key":"29_CR77","doi-asserted-by":"crossref","unstructured":"Zhao, C., Thabet, A.K., Ghanem, B.: Video self-stitching graph network for temporal action localization. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 13658\u201313667 (2021)","DOI":"10.1109\/ICCV48922.2021.01340"},{"key":"29_CR78","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1007\/978-3-030-58598-3_32","volume-title":"Computer Vision \u2013 ECCV 2020","author":"P Zhao","year":"2020","unstructured":"Zhao, P., Xie, L., Ju, C., Zhang, Y., Wang, Y., Tian, Q.: Bottom-up temporal action localization with mutual regularization. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12353, pp. 539\u2013555. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58598-3_32"},{"key":"29_CR79","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Wang, L., Wu, Z., Tang, X., Lin, D.: Temporal action detection with structured segment networks. In: IEEE Conference on Computer Vision, pp. 2914\u20132923 (2017)","DOI":"10.1109\/ICCV.2017.317"},{"key":"29_CR80","unstructured":"Zhao, Y., et al.: CUHK & ETHZ & SIAT submission to ActivityNet challenge 2017. arXiv preprint arXiv:1710.08011 (2017)"},{"key":"29_CR81","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Wang, P., Liu, W., Li, J., Ye, R., Ren, D.: Distance-IoU loss: faster and better learning for bounding box regression. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6999"},{"key":"29_CR82","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: International Conference on Learning Representations, pp. 1\u201311 (2021)"},{"key":"29_CR83","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Tang, W., Wang, L., Zheng, N., Hua, G.: Enriching local and global contexts for temporal action localization. In: International Conference on Computer Vision, pp. 13516\u201313525 (2021)","DOI":"10.1109\/ICCV48922.2021.01326"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19772-7_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T16:54:31Z","timestamp":1710262471000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19772-7_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031197710","9783031197727"],"references-count":83,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19772-7_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"28 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}