{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,20]],"date-time":"2025-08-20T13:02:51Z","timestamp":1755694971102,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":40,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819609079"},{"type":"electronic","value":"9789819609086"}],"license":[{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T00:00:00Z","timestamp":1733529600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0908-6_24","type":"book-chapter","created":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T19:24:07Z","timestamp":1733513047000},"page":"422-438","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["TAPS: Temporal Attention-Based Pruning and\u00a0Scaling for\u00a0Efficient Video Action Recognition"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-8994-054X","authenticated-orcid":false,"given":"Yonatan","family":"Dinai","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4428-0505","authenticated-orcid":false,"given":"Avraham","family":"Raviv","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1706-2695","authenticated-orcid":false,"given":"Nimrod","family":"Harel","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3334-309X","authenticated-orcid":false,"given":"Donghoon","family":"Kim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7883-032X","authenticated-orcid":false,"given":"Ishay","family":"Goldin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5623-8040","authenticated-orcid":false,"given":"Niv","family":"Zehngut","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,7]]},"reference":[{"key":"24_CR1","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: Vivit: A video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). pp. 6836\u20136846 (October 2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"24_CR2","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: Activitynet: A large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"24_CR3","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (July 2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"24_CR4","unstructured":"Chung, J., Gulcehre, C., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. CoRR abs\/1412.3555 (2014), http:\/\/arxiv.org\/abs\/1412.3555"},{"key":"24_CR5","doi-asserted-by":"crossref","unstructured":"Dai, R., Das, S., Kahatapitiya, K., Ryoo, M.S., Br\u00e9mond, F.: Ms-tct: multi-scale temporal convtransformer for action detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 20041\u201320051 (2022)","DOI":"10.1109\/CVPR52688.2022.01941"},{"key":"24_CR6","unstructured":"Dror, A.B., Zehngut, N., Raviv, A., Artyomov, E., Vitek, R.: Layer folding: Neural network depth reduction using activation linearization. In: 33rd British Machine Vision Conference 2022, BMVC 2022, London, UK, November 21-24, 2022. BMVA Press (2022), https:\/\/bmvc2022.mpi-inf.mpg.de\/0612.pdf"},{"key":"24_CR7","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3d: Expanding architectures for efficient video recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 203\u2013213 (2020)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"24_CR8","doi-asserted-by":"crossref","unstructured":"Gao, R., Oh, T.H., Grauman, K., Torresani, L.: Listen to look: Action recognition by previewing audio (2020)","DOI":"10.1109\/CVPR42600.2020.01047"},{"key":"24_CR9","unstructured":"Gao, X., Zhao, Y., Dudziak, \u0141., Mullins, R., Xu, C.z.: Dynamic channel pruning: Feature boosting and suppression. arXiv preprint arXiv:1810.05331 (2018)"},{"key":"24_CR10","doi-asserted-by":"crossref","unstructured":"Gao, Y., Zhang, B., Qi, X., So, H.K.H.: Dpacs: Hardware accelerated dynamic neural network pruning through algorithm-architecture co-design. In: Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2. pp. 237\u2013251 (2023)","DOI":"10.1145\/3575693.3575728"},{"key":"24_CR11","doi-asserted-by":"crossref","unstructured":"Ghodrati, A., Bejnordi, B.E., Habibian, A.: Frameexit: Conditional early exiting for efficient video recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.01535"},{"key":"24_CR12","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Carreira, J., Doersch, C., Zisserman, A.: Video action transformer network. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 244\u2013253 (2019)","DOI":"10.1109\/CVPR.2019.00033"},{"key":"24_CR13","doi-asserted-by":"crossref","unstructured":"Goyal, R., Ebrahimi\u00a0Kahou, S., Michalski, V., Materzynska, J., Westphal, S., Kim, H., Haenel, V., Fruend, I., Yianilos, P., Mueller-Freitag, M., et\u00a0al.: The \"something something\" video database for learning and evaluating visual common sense. CoRR abs\/1706.04261 (2017), http:\/\/arxiv.org\/abs\/1706.04261","DOI":"10.1109\/ICCV.2017.622"},{"issue":"11","key":"24_CR14","doi-asserted-by":"publisher","first-page":"7436","DOI":"10.1109\/TPAMI.2021.3117837","volume":"44","author":"Y Han","year":"2021","unstructured":"Han, Y., Huang, G., Song, S., Yang, L., Wang, H., Wang, Y.: Dynamic neural networks: A survey. IEEE Trans. Pattern Anal. Mach. Intell. 44(11), 7436\u20137456 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"24_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"24_CR16","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"24_CR17","unstructured":"Jang, E., Gu, S., Poole, B.: Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:1611.01144 (2016)"},{"key":"24_CR18","unstructured":"Koot, R., Lu, H.: Videolightformer: Lightweight action recognition using transformers (2021), https:\/\/arxiv.org\/abs\/2107.00451"},{"key":"24_CR19","doi-asserted-by":"crossref","unstructured":"Korbar, B., Tran, D., Torresani, L.: Scsampler: Sampling salient clips from video for efficient action recognition (2019)","DOI":"10.1109\/ICCV.2019.00633"},{"key":"24_CR20","doi-asserted-by":"crossref","unstructured":"Lin, J.: Tsm: Temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 7083\u20137093 (2019)","DOI":"10.1109\/ICCV.2019.00718"},{"key":"24_CR21","doi-asserted-by":"crossref","unstructured":"Lin, J., Duan, H., Chen, K., Lin, D., Wang, L.: Ocsampler: Compressing videos to one clip with single-step sampling (2022)","DOI":"10.1109\/CVPR52688.2022.01352"},{"key":"24_CR22","doi-asserted-by":"crossref","unstructured":"Materzynska, J., Berger, G., Bax, I., Memisevic, R.: The jester dataset: A large-scale video dataset of human gestures. In: Proceedings of the IEEE\/CVF international conference on computer vision workshops (2019)","DOI":"10.1109\/ICCVW.2019.00349"},{"key":"24_CR23","doi-asserted-by":"crossref","unstructured":"Meng, Y., Lin, C.C., Panda, R., Sattigeri, P., Karlinsky, L., Oliva, A., Saenko, K., Feris, R.: Ar-net: Adaptive frame resolution for efficient action recognition. In: European Conference on Computer Vision. pp. 86\u2013104. Springer (2020)","DOI":"10.1007\/978-3-030-58571-6_6"},{"key":"24_CR24","doi-asserted-by":"crossref","unstructured":"Meng, Y., Panda, R., Lin, C.C., Sattigeri, P., Karlinsky, L., Saenko, K., Oliva, A., Feris, R.: Adafuse: Adaptive temporal fusion network for efficient action recognition. arXiv preprint arXiv:2102.05775 (2021)","DOI":"10.1007\/978-3-030-58571-6_6"},{"key":"24_CR25","unstructured":"Raviv, A., Dinai, Y., Drozdov, I., Zehngut, N., Goldin, I.: D-step: Dynamic spatio-temporal pruning. In: 33rd British Machine Vision Conference 2022, BMVC 2022, London, UK, November 21-24, 2022. BMVA Press (2022), https:\/\/bmvc2022.mpi-inf.mpg.de\/0632.pdf"},{"key":"24_CR26","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., Chen, L.C.: Mobilenetv2: Inverted residuals and linear bottlenecks. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 4510\u20134520 (2018)","DOI":"10.1109\/CVPR.2018.00474"},{"key":"24_CR27","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. 
Advances in neural information processing systems 27 (2014)"},{"key":"24_CR28","doi-asserted-by":"crossref","unstructured":"Tian, Y., Lu, G., Yan, Y., Zhai, G., Chen, L., Gao, Z.: A coding framework and benchmark towards low-bitrate video understanding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)","DOI":"10.1109\/TPAMI.2024.3367879"},{"key":"24_CR29","doi-asserted-by":"crossref","unstructured":"Tian, Y., Lu, G., Zhai, G., Gao, Z.: Non-semantics suppressed mask learning for unsupervised video semantic compression. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 13610\u201313622 (2023)","DOI":"10.1109\/ICCV51070.2023.01252"},{"issue":"10","key":"24_CR30","doi-asserted-by":"publisher","first-page":"2453","DOI":"10.1007\/s11263-022-01661-1","volume":"130","author":"Y Tian","year":"2022","unstructured":"Tian, Y., Yan, Y., Zhai, G., Guo, G., Gao, Z.: Ean: event adaptive network for enhanced action recognition. Int. J. Comput. Vision 130(10), 2453\u20132471 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"24_CR31","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (December 2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"24_CR32","doi-asserted-by":"publisher","first-page":"1155","DOI":"10.1109\/ACCESS.2017.2778011","volume":"6","author":"A Ullah","year":"2017","unstructured":"Ullah, A., Ahmad, J., Muhammad, K., Sajjad, M., Baik, S.W.: Action recognition in video sequences using deep bi-directional lstm with cnn features. IEEE access 6, 1155\u20131166 (2017)","journal-title":"IEEE access"},{"key":"24_CR33","doi-asserted-by":"crossref","unstructured":"Verelst, T., Tuytelaars, T.: Dynamic convolutions: Exploiting spatial sparsity for faster inference. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2320\u20132329 (2020)","DOI":"10.1109\/CVPR42600.2020.00239"},{"key":"24_CR34","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., Gool, L.V.: Temporal segment networks: Towards good practices for deep action recognition. In: European conference on computer vision. pp. 20\u201336. Springer (2016)","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"24_CR35","doi-asserted-by":"crossref","unstructured":"Wang, Y., Yue, Y., Lin, Y., Jiang, H., Lai, Z., Kulikov, V., Orlov, N., Shi, H., Huang, G.: Adafocus v2: End-to-end training of spatial dynamic networks for video recognition. arXiv preprint arXiv:2112.14238 (2021)","DOI":"10.1109\/CVPR52688.2022.01943"},{"key":"24_CR36","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T., Natsev, P., Suleyman, M., Zisserman, A.: The kinetics human action video dataset. CoRR abs\/1705.06950 (2017), http:\/\/arxiv.org\/abs\/1705.06950"},{"key":"24_CR37","unstructured":"Wu, Z., Xiong, C., Jiang, Y.G., Davis, L.S.: Liteeval: A coarse-to-fine framework for resource efficient video recognition (2019)"},{"key":"24_CR38","unstructured":"Wu, Z., Xiong, C., Ma, C.Y., Socher, R., Davis, L.S.: Adaframe: Adaptive frame selection for fast video recognition. CoRR abs\/1811.12432 (2018), http:\/\/arxiv.org\/abs\/1811.12432"},{"key":"24_CR39","unstructured":"Zhang, Y., Bai, Y., Wang, H., Xu, Y., Fu, Y.: Look more but care less in video recognition. 
arXiv preprint arXiv:2211.09992 (2022)"},{"key":"24_CR40","unstructured":"Zhu, Y., Li, X., Liu, C., Zolfaghari, M., Xiong, Y., Wu, C., Zhang, Z., Tighe, J., Manmatha, R., Li, M.: A comprehensive study of deep video action recognition. arXiv preprint arXiv:2012.06567 (2020)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0908-6_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T20:15:34Z","timestamp":1733516134000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0908-6_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,7]]},"ISBN":["9789819609079","9789819609086"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0908-6_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,7]]},"assertion":[{"value":"7 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}