{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T23:38:16Z","timestamp":1776209896498,"version":"3.50.1"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031919787","type":"print"},{"value":"9783031919794","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91979-4_18","type":"book-chapter","created":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T19:07:30Z","timestamp":1748718450000},"page":"235-251","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["CA3D: Convolutional-Attentional 3D Nets for\u00a0Efficient Video Activity Recognition on\u00a0the\u00a0Edge"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4739-5778","authenticated-orcid":false,"given":"Gabriele","family":"Lagani","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6258-5313","authenticated-orcid":false,"given":"Fabrizio","family":"Falchi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3715-149X","authenticated-orcid":false,"given":"Claudio","family":"Gennaro","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0171-4315","authenticated-orcid":false,"given":"Giuseppe","family":"Amato","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"18_CR1","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: Vivit: A video vision transformer. In: ICCV, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"18_CR2","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv preprint arXiv:1607.06450 (2016)"},{"key":"18_CR3","doi-asserted-by":"crossref","unstructured":"Badar, A., et al.: Highlighting the importance of reducing research bias and carbon emissions in CNNs. In: International Conference of the Italian Association for Artificial Intelligence, pp. 515\u2013531. Springer (2021)","DOI":"10.1007\/978-3-031-08421-8_36"},{"key":"18_CR4","unstructured":"Banner, R., Nahshan, Y., Soudry, D.: Post training 4-bit quantization of convolutional networks for rapid-deployment. NeurIPS 32 (2019)"},{"key":"18_CR5","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML, vol.\u00a02, p.\u00a04 (2021)"},{"key":"18_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Y., Kalantidis, Y., Li, J., Yan, S., Feng, J.: Multi-fiber networks for video recognition. In: ECCV, pp. 352\u2013367 (2018)","DOI":"10.1007\/978-3-030-01246-5_22"},{"key":"18_CR7","unstructured":"Child, R., Gray, S., Radford, A., Sutskever, I.: Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509 (2019)"},{"key":"18_CR8","unstructured":"Choi, J., Wang, Z., Venkataramani, S., Chuang, P.I.J., Srinivasan, V., Gopalakrishnan, K.: Pact: Parameterized clipping activation for quantized neural networks. arXiv preprint arXiv:1805.06085 (2018)"},{"key":"18_CR9","unstructured":"Choromanski, K., et\u00a0al.: Rethinking attention with performers. arXiv preprint arXiv:2009.14794 (2020)"},{"key":"18_CR10","doi-asserted-by":"crossref","unstructured":"Choukroun, Y., Kravchik, E., Yang, F., Kisilev, P.: Low-bit quantization of neural networks for efficient inference. In: 2019 IEEE\/CVF International Conference on Computer Vision Workshop (ICCVW), pp. 3009\u20133018. IEEE (2019)","DOI":"10.1109\/ICCVW.2019.00363"},{"key":"18_CR11","doi-asserted-by":"crossref","unstructured":"Crasto, N., Weinzaepfel, P., Alahari, K., Schmid, C.: Mars: motion-augmented RGB stream for action recognition. In: CVPR, pp. 7882\u20137891 (2019)","DOI":"10.1109\/CVPR.2019.00807"},{"key":"18_CR12","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"18_CR13","doi-asserted-by":"crossref","unstructured":"Fan, L., Huang, W., Gan, C., Ermon, S., Gong, B., Huang, J.: End-to-end learning of motion representation for video understanding. In: CVPR, pp. 6016\u20136025 (2018)","DOI":"10.1109\/CVPR.2018.00630"},{"key":"18_CR14","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3d: Expanding architectures for efficient video recognition. In: CVPR, pp. 203\u2013213 (2020)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"18_CR15","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: ICCV, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"18_CR16","unstructured":"Gholamalinezhad, H., Khosravi, H.: Pooling methods in deep neural networks, a review. arXiv preprint arXiv:2009.07485 (2020)"},{"key":"18_CR17","doi-asserted-by":"crossref","unstructured":"Gholami, A., Kim, S., Dong, Z., Yao, Z., Mahoney, M.W., Keutzer, K.: A survey of quantization methods for efficient neural network inference. In: Low-Power Computer Vision, pp. 291\u2013326. Chapman and Hall\/CRC (2022)","DOI":"10.1201\/9781003162810-13"},{"key":"18_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"18_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Identity mappings in deep residual networks. In: ECCV, pp. 630\u2013645. Springer International Publishing, Cham (2016)","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"18_CR20","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: Accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 (2015)"},{"key":"18_CR21","doi-asserted-by":"crossref","unstructured":"Jacob, B., et al.: Quantization and training of neural networks for efficient integer-arithmetic-only inference. In: CVPR, pp. 2704\u20132713 (2018)","DOI":"10.1109\/CVPR.2018.00286"},{"issue":"1","key":"18_CR22","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji, S., Xu, W., Yang, M., Yu, K.: 3D convolutional neural networks for human action recognition. IEEE TPAMI 35(1), 221\u2013231 (2012)","journal-title":"IEEE TPAMI"},{"key":"18_CR23","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR, pp. 1725\u20131732 (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"18_CR24","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"18_CR25","unstructured":"Krishnamoorthi, R.: Quantizing deep convolutional networks for efficient inference: A whitepaper. arXiv preprint arXiv:1806.08342 (2018)"},{"key":"18_CR26","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: ICCV, pp. 2556\u20132563. IEEE (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"18_CR27","unstructured":"Lagani, G., Falchi, F., Gennaro, C., Amato, G.: Spiking neural networks and bio-inspired supervised deep learning: a survey. arXiv preprint arXiv:2307.16235 (2023)"},{"key":"18_CR28","unstructured":"Lagani, G., Falchi, F., Gennaro, C., Amato, G.: Synaptic plasticity models and bio-inspired unsupervised deep learning: a survey. arXiv preprint arXiv:2307.16236 (2023)"},{"key":"18_CR29","unstructured":"Lee, S.H., Lee, S., Song, B.C.: Vision transformer for small-size datasets. arXiv preprint arXiv:2112.13492 (2021)"},{"key":"18_CR30","unstructured":"Li, K., et al.: Uniformer: Unified transformer for efficient spatiotemporal representation learning. arXiv preprint arXiv:2201.04676 (2022)"},{"key":"18_CR31","unstructured":"Li, K., et al.: Uniformerv2: Spatiotemporal learning by arming image vits with video uniformer. arXiv preprint arXiv:2211.09552 (2022)"},{"key":"18_CR32","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: Hierarchical vision transformer using shifted windows. In: ICCV, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"18_CR33","unstructured":"Liu, Z., et al.: Video swin transformer. In: CVPR, pp. 3202\u20133211 (2022)"},{"key":"18_CR34","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted boltzmann machines. In: ICML (2010)"},{"key":"18_CR35","doi-asserted-by":"crossref","unstructured":"Ng, J.Y.H., Choi, J., Neumann, J., Davis, L.S.: Actionflownet: learning motion representation for action recognition. In: WACV, pp. 1616\u20131624. IEEE (2018)","DOI":"10.1109\/WACV.2018.00179"},{"key":"18_CR36","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A., Kuo, W., Angelova, A.: Rethinking video vits: sparse video tubes for joint image and video learning. In: CVPR, pp. 2214\u20132224 (2023)","DOI":"10.1109\/CVPR52729.2023.00220"},{"key":"18_CR37","unstructured":"Sharir, G., Noy, A., Zelnik-Manor, L.: An image is worth 16x16 words, what is a video worth? arXiv preprint arXiv:2103.13915 (2021)"},{"key":"18_CR38","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. NeurIPS 27 (2014)"},{"key":"18_CR39","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"issue":"1","key":"18_CR40","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. JMLR 15(1), 1929\u20131958 (2014)","journal-title":"JMLR"},{"key":"18_CR41","doi-asserted-by":"crossref","unstructured":"Stroud, J., Ross, D., Sun, C., Deng, J., Sukthankar, R.: D3d: distilled 3d networks for video action recognition. In: WACV, pp. 625\u2013634 (2020)","DOI":"10.1109\/WACV45572.2020.9093274"},{"key":"18_CR42","doi-asserted-by":"crossref","unstructured":"Sun, S., Kuang, Z., Sheng, L., Ouyang, W., Zhang, W.: Optical flow guided feature: a fast and robust motion representation for video action recognition. In: CVPR, pp. 1390\u20131399 (2018)","DOI":"10.1109\/CVPR.2018.00151"},{"key":"18_CR43","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"18_CR44","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: CVPR, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"18_CR45","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, pp. 5998\u20136008 (2017)"},{"key":"18_CR46","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)"},{"key":"18_CR47","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: CVPR, pp. 7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"18_CR48","doi-asserted-by":"crossref","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: ECCV, pp. 305\u2013321 (2018)","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"18_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, D., Yang, J., Ye, D., Hua, G.: Lq-nets: Learned quantization for highly accurate and compact deep neural networks. In: ECCV, pp. 365\u2013382 (2018)","DOI":"10.1007\/978-3-030-01237-3_23"},{"key":"18_CR50","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Vidtr: Video transformer without convolutions. In: ICCV, pp. 13577\u201313587 (October 2021)","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"18_CR51","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Lan, Z., Newsam, S., Hauptmann, A.: Hidden two-stream convolutional networks for action recognition. In: ACCV,pp. 363\u2013378. Springer (2019)","DOI":"10.1007\/978-3-030-20893-6_23"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91979-4_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T19:07:42Z","timestamp":1748718462000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91979-4_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031919787","9783031919794"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91979-4_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}