{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:06:04Z","timestamp":1777651564286,"version":"3.51.4"},"publisher-location":"Cham","reference-count":56,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031781094","type":"print"},{"value":"9783031781100","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78110-0_11","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:53:41Z","timestamp":1733090021000},"page":"164-178","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Project and\u00a0Pool: An Action Localization Network for\u00a0Localizing Actions in\u00a0Untrimmed Videos"],"prefix":"10.1007","author":[{"given":"Himanshu","family":"Singh","sequence":"first","affiliation":[]},{"given":"Avijit","family":"Dey","sequence":"additional","affiliation":[]},{"given":"Badri Narayan","family":"Subudhi","sequence":"additional","affiliation":[]},{"given":"Vinit","family":"Jakhetiya","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"11_CR1","doi-asserted-by":"crossref","unstructured":"Alwassel, H., Giancola, S., Ghanem, B.: TSP: temporally-sensitive pretraining of video encoders for localization 
tasks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3173\u20133183 (2021)","DOI":"10.1109\/ICCVW54120.2021.00356"},{"key":"11_CR2","doi-asserted-by":"crossref","unstructured":"Bai, Y., et al.: Boundary content graph neural network for temporal action proposal generation. In: Proceedings of the European Conference on Computer Vision, pp. 121\u2013137. Springer, Cham (2020)","DOI":"10.1007\/978-3-030-58604-1_8"},{"key":"11_CR3","doi-asserted-by":"crossref","unstructured":"Buch, S., et al.: SST: single-stream temporal action proposals. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2911\u20132920 (2017)","DOI":"10.1109\/CVPR.2017.675"},{"key":"11_CR4","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Chao, Y.-W., et al.: Rethinking the faster R-CNN architecture for temporal action localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1130\u20131139 (2018)","DOI":"10.1109\/CVPR.2018.00124"},{"key":"11_CR6","doi-asserted-by":"crossref","unstructured":"Dai, R., et al.: MS-TCT: multi-scale temporal convtransformer for action detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 20041\u201320051 (2022)","DOI":"10.1109\/CVPR52688.2022.01941"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Dai, R., et al.: PDAN: pyramid dilated attention network for action detection. In: Proceedings of the IEEE Winter Conference on Applications of Computer Vision, pp. 
2970\u20132979 (2021)","DOI":"10.1109\/WACV48630.2021.00301"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Escorcia, V., et al.: DAPS: deep action proposals for action understanding. In: Proceedings of the European Conference on Computer Vision, pp. 768\u2013784 (2016)","DOI":"10.1007\/978-3-319-46487-9_47"},{"key":"11_CR9","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: Reconfigurable acceleration of 3D-CNNs for human action recognition with block floating-point representation. In: Proceedings of the International Conference on Field Programmable Logic and Applications, pp. 287\u20132877 (2018)","DOI":"10.1109\/FPL.2018.00056"},{"key":"11_CR10","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., et al.: Slowfast networks for video recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"11_CR11","doi-asserted-by":"crossref","unstructured":"Gong, G., Zheng, L., Mu, Y.: Scale matters: temporal scale aggregation network for precise action localization in untrimmed videos. In: Proceedings of the IEEE International Conference on Multimedia and Expo, pp. 1\u20136 (2020)","DOI":"10.1109\/ICME46284.2020.9102850"},{"key":"11_CR12","doi-asserted-by":"crossref","unstructured":"Gritsenko, A.A., et al.: End-to-end spatio-temporal action localisation with video transformers. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 18373\u201318383 (2024)","DOI":"10.1109\/CVPR52733.2024.01739"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Niebles, J.C., Ghanem, B.: Fast temporal activity proposals for efficient detection of human actions in untrimmed videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp.
1914\u20131923 (2016)","DOI":"10.1109\/CVPR.2016.211"},{"key":"11_CR14","doi-asserted-by":"crossref","unstructured":"Kapoor, M., et al.: Underwater moving object detection using an end-to-end encoder-decoder architecture and GraphSage with aggregator and refactoring. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 5636\u20135645 (2023)","DOI":"10.1109\/CVPRW59228.2023.00597"},{"key":"11_CR15","unstructured":"Kipf, T.N., Welling, M.: Semi-Supervised Classification with Graph Convolutional Networks. (2017). arXiv: 1609.02907"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: DeTAL: open-vocabulary temporal action localization with decoupled networks. IEEE Trans. Pattern Anal. Mach. Intell. 1\u201314 (2024)","DOI":"10.1109\/TPAMI.2024.3395778"},{"key":"11_CR17","doi-asserted-by":"crossref","unstructured":"Lin, C., et al.: Fast learning of temporal action proposal via dense boundary generator. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 11499\u201311506 (2020)","DOI":"10.1609\/aaai.v34i07.6815"},{"key":"11_CR18","doi-asserted-by":"crossref","unstructured":"Lin, C., et al.: Learning salient boundary feature for anchor-free temporal action localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3320\u20133329 (2021)","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"11_CR19","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Shou, Z.: Single shot temporal action detection. In: Proceedings of the ACM International Conference on Multimedia, pp. 988\u2013996 (2017)","DOI":"10.1145\/3123266.3123343"},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Lin, T., et al.: BMN: boundary-matching network for temporal action proposal generation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 
3889\u20133898 (2019)","DOI":"10.1109\/ICCV.2019.00399"},{"key":"11_CR21","doi-asserted-by":"crossref","unstructured":"Lin, T., et al.: BSN: boundary sensitive network for temporal action proposal generation. In: Proceedings of the European Conference on Computer Vision, pp. 3\u201319 (2018)","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., et al.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"11_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Q., Wang, Z.: Progressive boundary refinement network for temporal action detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 11612\u201311619 (2020)","DOI":"10.1609\/aaai.v34i07.6829"},{"key":"11_CR24","doi-asserted-by":"crossref","unstructured":"Liu, W., et al.: SSD: single shot multibox detector. In: Proceedings of the European Conference on Computer Vision, pp. 21\u201337 (2016)","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"11_CR25","doi-asserted-by":"publisher","first-page":"5427","DOI":"10.1109\/TIP.2022.3195321","volume":"31","author":"X Liu","year":"2022","unstructured":"Liu, X., et al.: End-to-end temporal action detection with transformer. IEEE Trans. Image Process. 31, 5427\u20135441 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: Multi-shot temporal event localization: a benchmark. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 12596\u201312606 (2021)","DOI":"10.1109\/CVPR46437.2021.01241"},{"key":"11_CR27","doi-asserted-by":"crossref","unstructured":"Long, F., et al.: Gaussian temporal awareness networks for action localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
344\u2013353 (2019)","DOI":"10.1109\/CVPR.2019.00043"},{"key":"11_CR28","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: Proceedings of the International Conference on Learning Representations, pp. 1\u201319 (2018)"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Meng, H., Pears, N., Bailey, C.: A human action recognition system for embedded computer vision application. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20136 (2007)","DOI":"10.1109\/CVPR.2007.383420"},{"key":"11_CR30","doi-asserted-by":"crossref","unstructured":"Qing, Z., et al.: Temporal context aggregation network for temporal action proposal refinement. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 485\u2013494 (2021)","DOI":"10.1109\/CVPR46437.2021.00055"},{"key":"11_CR31","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Mei, T.: Learning spatio-temporal representation with pseudo-3D residual networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5533\u20135541 (2017)","DOI":"10.1109\/ICCV.2017.590"},{"issue":"5","key":"11_CR32","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1109\/TCSVT.2011.2129370","volume":"21","author":"C Rougier","year":"2011","unstructured":"Rougier, C., et al.: Robust video surveillance for fall detection based on human shape deformation. IEEE Trans. Circuits Syst. Video Technol. 21(5), 611\u2013622 (2011)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"11_CR33","doi-asserted-by":"crossref","unstructured":"Shao, J., et al.: Action sensitivity learning for temporal action localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 
13457\u201313469 (2023)","DOI":"10.1109\/ICCV51070.2023.01238"},{"key":"11_CR34","doi-asserted-by":"crossref","unstructured":"Shou, Z., et al.: CDC: convolutional-de-convolutional networks for precise temporal action localization in untrimmed videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5734\u20135743 (2017)","DOI":"10.1109\/CVPR.2017.155"},{"issue":"1","key":"11_CR35","first-page":"1","volume":"1","author":"H Singh","year":"2022","unstructured":"Singh, H., et al.: Action recognition in dark videos using spatio-temporal features and bidirectional encoder representations from transformers. IEEE Trans. Artif. Intell. 1(1), 1\u201311 (2022)","journal-title":"IEEE Trans. Artif. Intell."},{"key":"11_CR36","doi-asserted-by":"crossref","unstructured":"Singh, H., et al.: C3D and localization model for locating and recognizing the actions from untrimmed videos (student abstract). In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 13051\u201313052 (2022)","DOI":"10.1609\/aaai.v36i11.21662"},{"key":"11_CR37","doi-asserted-by":"crossref","unstructured":"Sridhar, D., et al.: Class semantics-based attention for action detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 13739\u201313748 (2021)","DOI":"10.1109\/ICCV48922.2021.01348"},{"key":"11_CR38","unstructured":"Tan, J., et al.: PointTAD: Multi-Label Temporal Action Detection with Learnable Query Points. arXiv preprint arXiv:2210.11035 (2022)"},{"key":"11_CR39","doi-asserted-by":"crossref","unstructured":"Tan, J., et al.: Relaxed transformer decoders for direct action proposal generation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 13526\u201313535 (2021)","DOI":"10.1109\/ICCV48922.2021.01327"},{"key":"11_CR40","doi-asserted-by":"crossref","unstructured":"Tirupattur, P., et al.: Modeling multi-label action dependencies for temporal action localization. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1460\u20131470 (2021)","DOI":"10.1109\/CVPR46437.2021.00151"},{"key":"11_CR41","doi-asserted-by":"crossref","unstructured":"Tran, D., et al.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"11_CR42","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Proceedings of the European Conference on Computer Vision, pp. 20\u201336 (2016)","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"11_CR43","unstructured":"Wang, L., et al.: Temporal Action Proposal Generation with Transformers (2021). arXiv: 2105.12043"},{"key":"11_CR44","doi-asserted-by":"crossref","unstructured":"Xu, M., et al.: G-TAD: sub-graph localization for temporal action detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 10156\u201310165 (2020)","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"11_CR45","doi-asserted-by":"publisher","first-page":"8535","DOI":"10.1109\/TIP.2020.3016486","volume":"29","author":"L Yang","year":"2020","unstructured":"Yang, L., et al.: Revisiting anchor mechanisms for temporal action localization. IEEE Trans. Image Process. 29, 8535\u20138548 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"11_CR46","doi-asserted-by":"crossref","unstructured":"Yang, Z., Qin, J., Huang, D.: ACGNET: action complement graph network for weakly-supervised temporal action localization. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 3090\u20133098 (2022)","DOI":"10.1609\/aaai.v36i3.20216"},{"key":"11_CR47","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Tokens-to-token VIT: training vision transformers from scratch on imagenet. 
In: Proceedings of the IEEE International Conference on Computer Vision, pp. 558\u2013567 (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"11_CR48","unstructured":"Zeng, R., et al.: Graph convolutional networks for temporal action localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 7094\u20137103 (2019)"},{"key":"11_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, C.-L., Wu, J., Li, Y.: Actionformer: localizing moments of actions with transformers. In: Proceedings of the European Conference on Computer Vision, pp. 492\u2013510 (2022)","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"11_CR50","doi-asserted-by":"crossref","unstructured":"Zhao, C., Thabet, A.K., Ghanem, B.: Video self-stitching graph network for temporal action localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 13658\u201313667 (2021)","DOI":"10.1109\/ICCV48922.2021.01340"},{"key":"11_CR51","doi-asserted-by":"crossref","unstructured":"Zhao, C., et al.: Re2TAL: rewiring pretrained video backbones for reversible temporal action localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 10637\u201310647 (2023)","DOI":"10.1109\/CVPR52729.2023.01025"},{"key":"11_CR52","doi-asserted-by":"crossref","unstructured":"Zhao, P., et al.: Bottom-up temporal action localization with mutual regularization. In: Proceedings of the European Conference on Computer Vision, pp. 539\u2013555 (2020)","DOI":"10.1007\/978-3-030-58598-3_32"},{"key":"11_CR53","doi-asserted-by":"crossref","unstructured":"Zheng, Z., et al.: Distance-IoU loss: faster and better learning for bounding box regression. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 12993\u201313000 (2020)","DOI":"10.1609\/aaai.v34i07.6999"},{"key":"11_CR54","unstructured":"Zhu, X., et al.: Deformable DETR: Deformable Transformers for End-to-End Object Detection. 
arXiv preprint arXiv:2010.04159 (2020)"},{"key":"11_CR55","doi-asserted-by":"crossref","unstructured":"Zhu, Z., et al.: Contextloc++: a unified context model for temporal action localization. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3237597"},{"key":"11_CR56","doi-asserted-by":"crossref","unstructured":"Zhu, Z., et al.: Enriching local and global contexts for temporal action localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 13516\u201313525 (2021)","DOI":"10.1109\/ICCV48922.2021.01326"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78110-0_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:33:09Z","timestamp":1733095989000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78110-0_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031781094","9783031781100"],"references-count":56,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78110-0_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}