{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T06:10:28Z","timestamp":1757311828343,"version":"3.37.3"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T00:00:00Z","timestamp":1689724800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T00:00:00Z","timestamp":1689724800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["2021R1I1A3042145","2021R1I1A3042145"],"award-info":[{"award-number":["2021R1I1A3042145","2021R1I1A3042145"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute for Information and Communications Technology Promotion","doi-asserted-by":"publisher","award":["IITP-2023-2020-0-01462"],"award-info":[{"award-number":["IITP-2023-2020-0-01462"]}],"id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s13735-023-00280-x","type":"journal-article","created":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T13:01:50Z","timestamp":1689771710000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Cluster-guided temporal modeling for action recognition"],"prefix":"10.1007","volume":"12","author":[{"given":"Jeong-Hun","family":"Kim","sequence":"first","affiliation":[]},{"given":"Fei","family":"Hao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7541-9127","authenticated-orcid":false,"given":"Carson Kai-Sang","family":"Leung","sequence":"additional","affiliation":[]},{"given":"Aziz","family":"Nasridinov","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,7,19]]},"reference":[{"key":"280_CR1","doi-asserted-by":"publisher","unstructured":"Segal S, Kee E, Luo W, Sadat A, Yumer E, Urtasun R (2020) Universal embeddings for spatio-temporal tagging of self-driving logs. In: proceedings of conference on robot learning (CoRL), Nov. 16\u201318. Cambridge, MA, USA, pp. 973\u2013983. https:\/\/doi.org\/10.48550\/arXiv.2011.06165.","DOI":"10.48550\/arXiv.2011.06165"},{"key":"280_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10489-022-03813-9","volume":"53","author":"C Li","year":"2022","unstructured":"Li C, Chen X (2022) Video prediction for driving scenes with a memory differential motion network model. Appl Intell 53:1\u201317. https:\/\/doi.org\/10.1007\/s10489-022-03813-9","journal-title":"Appl Intell"},{"issue":"1","key":"280_CR3","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1007\/s10489-015-0695-5","volume":"44","author":"E Bastianelli","year":"2016","unstructured":"Bastianelli E, Nardi D, Aiello L-C, Giacomelli F, Manes N (2016) Speaky for robots: the development of vocal interfaces for robotic applications. Appl Intell 44(1):43\u201366. https:\/\/doi.org\/10.1007\/s10489-015-0695-5","journal-title":"Appl Intell"},{"key":"280_CR4","doi-asserted-by":"publisher","unstructured":"Nguyen A, Kanoulas D, Muratore L, Caldwell D-G, Tsagarakis N-G (2018) Translating videos to commands for robotic manipulation with deep recurrent neural networks. In: proceedings of the IEEE international conference on robotics and automation (ICRA), May 21\u201325. Brisbane, QLD, Australia, pp. 3782\u20133788. https:\/\/doi.org\/10.1109\/ICRA.2018.8460857.","DOI":"10.1109\/ICRA.2018.8460857"},{"key":"280_CR5","doi-asserted-by":"publisher","unstructured":"Wilson A, Lin M-C (2020) AVOT: audio-visual object tracking of multiple objects for robotics. In: proceedings of the IEEE international conference on robotics and automation (ICRA), May 31-Aug. 31. Paris, France, pp. 10045\u201310051. https:\/\/doi.org\/10.1109\/ICRA40945.2020.9197528.","DOI":"10.1109\/ICRA40945.2020.9197528"},{"key":"280_CR6","doi-asserted-by":"publisher","unstructured":"Choi S, On K-W, Heo Y-J, Seo A, Jang Y, Lee M, Zhang B-T (2021) DramaQA: character-centered video story understanding with hierarchical QA. In: proceedings of the AAAI conference on artificial intelligence (AAAI), Feb. 2\u20139. San Francisco, CA, USA, pp. 1166\u20131174. https:\/\/doi.org\/10.1609\/aaai.v35i2.16203.","DOI":"10.1609\/aaai.v35i2.16203"},{"key":"280_CR7","doi-asserted-by":"publisher","unstructured":"Xiao J, Shang X, Yao A, Chua T-S (2021) Next-qa: Next phase of question-answering to explaining temporal actions. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 19\u201325. Nashville, TN, USA, pp. 9777\u20139786. https:\/\/doi.org\/10.1109\/CVPR46437.2021.00965.","DOI":"10.1109\/CVPR46437.2021.00965"},{"issue":"2","key":"280_CR8","doi-asserted-by":"publisher","first-page":"690","DOI":"10.1007\/s10489-020-01823-z","volume":"51","author":"O Elharrouss","year":"2021","unstructured":"Elharrouss O, Almaadeed N, Al-Maadeed S, Bouridane A, Beghdadi A (2021) A combined multiple action recognition and summarization for surveillance video sequences. Appl Intell 51(2):690\u2013712. https:\/\/doi.org\/10.1007\/s10489-020-01823-z","journal-title":"Appl Intell"},{"issue":"4","key":"280_CR9","doi-asserted-by":"publisher","first-page":"2128","DOI":"10.1007\/s10489-020-01933-8","volume":"51","author":"L He","year":"2021","unstructured":"He L, Wen S, Wang L, Li F (2021) Vehicle theft recognition from surveillance video based on spatiotemporal attention. Appl Intell 51(4):2128\u20132143. https:\/\/doi.org\/10.1007\/s10489-020-01933-8","journal-title":"Appl Intell"},{"key":"280_CR10","doi-asserted-by":"publisher","unstructured":"Wu S, He X, Lu H, Yuille A-L (2010) A unified model of short-range and long-range motion perception. In: advances in neural information processing systems (NIPS), Dec. 6\u201311. Vancouver, Canada, pp. 2478\u20132486. https:\/\/doi.org\/10.5555\/2997046.2997172.","DOI":"10.5555\/2997046.2997172"},{"key":"280_CR11","doi-asserted-by":"publisher","unstructured":"Li Y, Ji B, Shi X, Zhang J, Kang B, Wang L (2020) TEA: temporal excitation and aggregation for action recognition. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 14\u201319. Seattle, WA, USA, pp. 909\u2013918. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00099.","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"280_CR12","doi-asserted-by":"publisher","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Jun. 24\u201327. Columbus, OH, USA, pp. 1725\u20131732. https:\/\/doi.org\/10.1109\/CVPR.2014.223.","DOI":"10.1109\/CVPR.2014.223"},{"key":"280_CR13","doi-asserted-by":"publisher","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: advances in neural information processing systems (NIPS), Dec. 8\u201313. Montreal, Canada, pp. 568\u2013576. https:\/\/doi.org\/10.5555\/2968826.2968890.","DOI":"10.5555\/2968826.2968890"},{"key":"280_CR14","doi-asserted-by":"publisher","unstructured":"Yue-Hei Ng J, Hausknecht M, Vijayanarasimhan S, Vinyals O, Monga R, Toderici G (2015) Beyond short snippets: deep networks for video classification. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 7\u201312. Rockville, MD, USA, pp. 4694\u20134702. https:\/\/doi.org\/10.1109\/CVPR.2015.7299101.","DOI":"10.1109\/CVPR.2015.7299101"},{"issue":"11","key":"280_CR15","doi-asserted-by":"publisher","first-page":"1369","DOI":"10.1016\/j.patrec.2010.03.024","volume":"31","author":"J Pers","year":"2010","unstructured":"Pers J, Sulic V, Kristan M, Perse M, Polanec K, Kovacic S (2010) Histograms of optical flow for efficient representation of body motion. Pattern Recognit Lett 31(11):1369\u20131376. https:\/\/doi.org\/10.1016\/j.patrec.2010.03.024","journal-title":"Pattern Recognit Lett"},{"key":"280_CR16","doi-asserted-by":"publisher","unstructured":"Sun L, Jia K, Chen K, Yeung D-Y, Shi B-E, Savarese S (2017) Lattice long short-term memory for human action recognition. In: proceedings of the IEEE international conference on computer vision (ICCV), Oct. 22\u201329. Venice, Italy, pp. 2147\u20132156. https:\/\/doi.org\/10.1109\/ICCV.2017.236.","DOI":"10.1109\/ICCV.2017.236"},{"issue":"7","key":"280_CR17","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TMM.2017.2666540","volume":"19","author":"Y Shi","year":"2017","unstructured":"Shi Y, Tian Y, Wang Y, Huang T (2017) Sequential deep trajectory descriptor for action recognition with three-stream CNN. IEEE Trans Multimedia 19(7):1510\u20131520","journal-title":"IEEE Trans Multimedia"},{"key":"280_CR18","doi-asserted-by":"publisher","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M, (2015) Learning spatiotemporal features with 3d convolutional networks. In: proceedings of the IEEE international conference on computer vision (ICCV), Dec. 11\u201318. Santiago, Chile, pp. 4489\u20134497. https:\/\/doi.org\/10.1109\/ICCV.2015.510.","DOI":"10.1109\/ICCV.2015.510"},{"key":"280_CR19","doi-asserted-by":"publisher","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Gool L-V (2016) Temporal segment networks: Towards good practices for deep action recognition. In: proceedings of the European conference on computer vision (ECCV), Oct. 8\u201316. Amsterdam, Netherlands, pp. 20\u201336. https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2.","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"280_CR20","doi-asserted-by":"publisher","unstructured":"Girdhar R, Ramanan D, Gupta A, Sivic J, Russell B (2017) ActionVLAD: learning spatio-temporal aggregation for action recognition. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jul. 21\u201326. Honolulu, HI, USA, pp. 971\u2013980. https:\/\/doi.org\/10.1109\/CVPR.2017.337.","DOI":"10.1109\/CVPR.2017.337"},{"key":"280_CR21","doi-asserted-by":"publisher","unstructured":"Gao R, Oh T-H, Grauman K, Torresani L (2020) Listen to look: action recognition by previewing audio. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 14\u201319. Seattle, WA, USA, pp. 10457\u201310467. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01047.","DOI":"10.1109\/CVPR42600.2020.01047"},{"key":"280_CR22","doi-asserted-by":"publisher","unstructured":"Zhu C, Tan X, Zhou F, Liu X, Yue K, Ding E, and Ma Y, (2018) Fine-grained video categorization with redundancy reduction attention. In: proceedings of the European conference on computer vision (ECCV), Sep. 8\u201314, Munich, Germany, pp 136\u2013152. https:\/\/doi.org\/10.1007\/978-3-030-01228-1_9.","DOI":"10.1007\/978-3-030-01228-1_9"},{"key":"280_CR23","doi-asserted-by":"publisher","unstructured":"Wu C-Y, Zaheer M, Hu H, Manmatha R, Smola A-J, Kr\u00e4henb\u00fchl P (2018) Compressed video action recognition. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 18\u201322, Salt Lake City, UT, USA, pp. 6026\u20136035. https:\/\/doi.org\/10.1109\/CVPR.2018.00631.","DOI":"10.1109\/CVPR.2018.00631"},{"key":"280_CR24","doi-asserted-by":"publisher","unstructured":"Goyal R, Ebrahimi Kahou S, Michalski V, Materzynska J., Westphal S, Kim H, Haenel V, Fruend I, Yianilos P, Mueller-Freitag M, Hoppe F, Thurau C, Bax I, Memisevic R (2017) The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: proceedings of the IEEE International Conference on Computer Vision (ICCV), Oct. 22\u201329. Venice, Italy, pp. 5842\u20135850. https:\/\/doi.org\/10.1109\/ICCV.2017.622.","DOI":"10.1109\/ICCV.2017.622"},{"key":"280_CR25","unstructured":"Mahdisoltani F, Berger G, Gharbieh W, Fleet D, Memisevic R (2018) Fine-grained video classification and captioning. arXiv preprint arXiv:1804.09235."},{"key":"280_CR26","unstructured":"Soomro K, Zamir A-R, Shah M (2012) UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402."},{"key":"280_CR27","unstructured":"Kay W, Carreira J, Simonyan K, Zhang B, Hillier C, Vijayanarasimhan S, Viola F, Green T, Back T, Natsev P, Suleyman M, Zisserman A (2017) The kinetics human action video dataset. arXiv preprint arXiv:1705.06950."},{"key":"280_CR28","doi-asserted-by":"publisher","unstructured":"Caba Heilbron F, Escorcia V, Ghanem B, Carlos Neibles J (2015) ActivityNet: a large-scale video benchmark for human activity understanding. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 7\u201312. Rockville, MD, USA, pp. 961\u2013970. https:\/\/doi.org\/10.1109\/CVPR.2015.7298698.","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"280_CR29","doi-asserted-by":"publisher","unstructured":"Krizhevsky A, Sutskever I, Hinton G-E (2012) ImageNet classification with deep convolutional neural networks. In: advances in neural information processing systems (NIPS), Dec. 3\u20138. Lake Tahoe, NV, USA, pp. 84\u201390. https:\/\/doi.org\/10.1145\/3065386.","DOI":"10.1145\/3065386"},{"key":"280_CR30","doi-asserted-by":"publisher","unstructured":"Simonyan K, Zisserman A (2015) Very deep convolutional networks for large-scale image recognition. In: proceedings of the international conference on learning representations (ICLR), May 7\u20139. San Diego, CA, USA, pp. 1\u201314. https:\/\/doi.org\/10.48550\/arXiv.1409.1556.","DOI":"10.48550\/arXiv.1409.1556"},{"key":"280_CR31","doi-asserted-by":"publisher","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 7\u201312. Boston, MA, USA, pp. 1\u20139. https:\/\/doi.org\/10.1109\/CVPR.2015.7298594.","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"280_CR32","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 27\u201330. Las Vegas, NV, USA, pp. 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90.","DOI":"10.1109\/CVPR.2016.90"},{"key":"280_CR33","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Identity mappings in deep residual networks. In: proceedings of the european conference on computer vision (ECCV), Oct. 8\u201316. Amsterdam, Netherlands, pp. 630\u2013645. https:\/\/doi.org\/10.1007\/978-3-319-46493-0_38.","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"280_CR34","doi-asserted-by":"publisher","unstructured":"Tan, M., Le, Q., 2019. EfficientNet: Rethinking model scaling for convolutional neural networks. In: International Conference on Machine Learning (ICML), Jun. 9\u201315. Long Beach, CA, USA, pp. 6105\u20136114. https:\/\/doi.org\/10.48550\/arXiv.1905.11946.","DOI":"10.48550\/arXiv.1905.11946"},{"issue":"11","key":"280_CR35","doi-asserted-by":"publisher","first-page":"7903","DOI":"10.1007\/s10489-021-02280-y","volume":"51","author":"C Liu","year":"2021","unstructured":"Liu C, Huang L, Wei Z, Zhang W (2021) Subtler mixed attention network on fine-grained image classification. Appl Intell 51(11):7903\u20137916. https:\/\/doi.org\/10.1007\/s10489-021-02280-y","journal-title":"Appl Intell"},{"issue":"1","key":"280_CR36","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji S, Xu W, Yang M, Yu K (2012) 3D convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231. https:\/\/doi.org\/10.1109\/TPAMI.2012.59","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"280_CR37","doi-asserted-by":"publisher","unstructured":"Donahue J, Anne Hendricks L, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 7\u201312. Boston, MA, USA, pp. 2625\u20132634. https:\/\/doi.org\/10.1109\/CVPR.2015.7298878.","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"280_CR38","doi-asserted-by":"publisher","unstructured":"Yao L, Torabi A, Cho K, Ballas N, Pal C, Larochelle H, Courville A (2015) Describing videos by exploiting temporal structure. In: proceedings of the IEEE international conference on computer vision (ICCV), Dec. 11\u201318. Santiago, Chile, pp. 4507\u20134515. https:\/\/doi.org\/10.1109\/ICCV.2015.512.","DOI":"10.1109\/ICCV.2015.512"},{"key":"280_CR39","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional two-stream network fusion for video action recognition. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 27\u201330. Las Vegas, NV, USA, pp. 1933\u20131941. https:\/\/doi.org\/10.1109\/CVPR.2016.213.","DOI":"10.1109\/CVPR.2016.213"},{"key":"280_CR40","doi-asserted-by":"publisher","unstructured":"Zhang, J., Zheng, Y., Qi, D., 2017. Deep spatio-temporal residual networks for citywide crowd flows prediction. In: proceedings of the AAAI conference on artificial intelligence (AAAI), Feb. 2\u20139. San Francisco, CA, USA, pp. 1166\u20131174. https:\/\/doi.org\/10.1609\/aaai.v31i1.10735.","DOI":"10.1609\/aaai.v31i1.10735"},{"key":"280_CR41","doi-asserted-by":"publisher","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Jul. 21\u201326. Honolulu, HI, USA, pp. 6299\u20136308. https:\/\/doi.org\/10.1109\/CVPR.2017.502.","DOI":"10.1109\/CVPR.2017.502"},{"key":"280_CR42","doi-asserted-by":"publisher","unstructured":"Wang X, Girshick R, Gupta A, He K (2018) Non-local neural networks. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 18\u201322. Salt Lake City, UT, USA, pp. 7794\u20137803. https:\/\/doi.org\/10.1109\/CVPR.2018.00813.","DOI":"10.1109\/CVPR.2018.00813"},{"key":"280_CR43","doi-asserted-by":"publisher","unstructured":"Yang C, Xu Y, Shi J, Dai B, Zhou B (2020) Temporal pyramid network for action recognition. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 14\u201319. Seattle, WA, USA, pp. 591\u2013600. https:\/\/doi.org\/10.1109\/CVPR42600.2020.00067.","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"280_CR44","doi-asserted-by":"publisher","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) SlowFast networks for video recognition. In: proceedings of the IEEE international conference on computer vision (ICCV), Oct. 27-Nov. 2. Seoul, South Korea, pp. 6202\u20136211. https:\/\/doi.org\/10.1109\/ICCV.2019.00630.","DOI":"10.1109\/ICCV.2019.00630"},{"key":"280_CR45","doi-asserted-by":"publisher","unstructured":"Tran D, Wang H, Torresani L, Ray J, LeCun Y, Paluri M (2018) A closer look at spatiotemporal convolutions for action recognition. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 18\u201322. Salt Lake City, UT, USA, pp. 6450\u20136459. https:\/\/doi.org\/10.1109\/CVPR.2018.00675.","DOI":"10.1109\/CVPR.2018.00675"},{"key":"280_CR46","doi-asserted-by":"publisher","unstructured":"Xie S, Sun C, Huang J, Tu Z, Murphy K (2018) Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: proceedings of the European conference on computer vision (ECCV), Sep. 8\u201314. Munich, Germany, pp. 305\u2013321. https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19.","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"280_CR47","doi-asserted-by":"publisher","unstructured":"Tran D, Wang H, Torresani L, Feiszli M (2019) Video classification with channel-separated convolutional networks. In: proceedings of the IEEE\/CVF international conference on computer vision (ICCV), Oct. 27-Nov. 2. Seoul, South Korea, pp. 5552\u20135561. https:\/\/doi.org\/10.1109\/ICCV.2019.00565.","DOI":"10.1109\/ICCV.2019.00565"},{"key":"280_CR48","doi-asserted-by":"publisher","unstructured":"Lin J, Gan C, Han S (2019) TSM: temporal shift module for efficient video understanding. In: proceedings of the IEEE\/CVF international conference on computer vision (ICCV), Oct. 27-Nov. 2. Seoul, South Korea, pp. 7083\u20137093. https:\/\/doi.org\/10.1109\/ICCV.2019.00718.","DOI":"10.1109\/ICCV.2019.00718"},{"key":"280_CR49","doi-asserted-by":"publisher","unstructured":"Wang L, Tong Z, Ji B, Wu G (2021) Tdn: temporal difference networks for efficient action recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 19\u201325. Nashville, TN, USA, pp. 1895\u20131904. https:\/\/doi.org\/10.1109\/CVPR46437.2021.00193.","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"280_CR50","doi-asserted-by":"publisher","first-page":"357","DOI":"10.1016\/j.patcog.2019.03.002","volume":"91","author":"P Wang","year":"2019","unstructured":"Wang P, Liu L, Shen C, Shen H-T (2019) Order-aware convolutional pooling for video based action recognition. Pattern Recognit 91:357\u2013365. https:\/\/doi.org\/10.1016\/j.patcog.2019.03.002","journal-title":"Pattern Recognit"},{"key":"280_CR51","doi-asserted-by":"publisher","unstructured":"Yeung S, Russakovsky O, Mori G, Fei-Fei L (2016) End-to-end learning of action detection from frame glimpses in videos. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 27\u201330. Las Vegas, NV, USA, pp. 2678\u20132687. https:\/\/doi.org\/10.1109\/CVPR.2016.293.","DOI":"10.1109\/CVPR.2016.293"},{"key":"280_CR52","doi-asserted-by":"crossref","unstructured":"Fan H, Xu Z, Zhu L, Yan C, Ge J, Yang Y (2018) Watching a small portion could be as good as watching all: towards efficient video classification. In: international joint conference on artificial intelligence (IJCAI), Jul. 13\u201319. Stockholm, Sweden, pp. 705\u2013711","DOI":"10.24963\/ijcai.2018\/98"},{"key":"280_CR53","doi-asserted-by":"publisher","unstructured":"Wu Z, Xiong C, Ma C-Y, Socher R, Davis L-S (2019) AdaFrame: adaptive frame selection for fast video recognition. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 16\u201320. Long Beach, CA, USA, pp. 1278\u20131287. https:\/\/doi.org\/10.1109\/CVPR.2019.00137.","DOI":"10.1109\/CVPR.2019.00137"},{"key":"280_CR54","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108797","author":"W Dong","year":"2022","unstructured":"Dong W, Zhang Z, Song C, Tan T (2022) Identifying the key frames: an attention-aware sampling method for action recognition. Patt Recognit. https:\/\/doi.org\/10.1016\/j.patcog.2022.108797","journal-title":"Patt Recognit"},{"key":"280_CR55","doi-asserted-by":"publisher","unstructured":"Korbar B, Tran D, Torresani L (2019) SCSampler: sampling salient clips from video for efficient action recognition. In: proceedings of the IEEE international conference on computer vision (ICCV), Oct. 27-Nov. 2. Seoul, South Korea, pp. 6232\u20136242. https:\/\/doi.org\/10.1109\/ICCV.2019.00633","DOI":"10.1109\/ICCV.2019.00633"},{"key":"280_CR56","doi-asserted-by":"publisher","unstructured":"Gowda S-N, Rohrbach M, Sevilla-Lara L (2021) Smart frame selection for action recognition. In: proceedings of the AAAI conference on artificial intelligence (AAAI), Feb. 2\u20139. San Francisco, CA, USA, pp. 1451\u20131459. https:\/\/doi.org\/10.1609\/aaai.v35i2.16235","DOI":"10.1609\/aaai.v35i2.16235"},{"issue":"1","key":"280_CR57","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1109\/18.61115","volume":"37","author":"J Lin","year":"1991","unstructured":"Lin J (1991) Divergence measures based on the Shannon entropy. IEEE Trans on Inform Theory 37(1):145\u2013151. https:\/\/doi.org\/10.1109\/18.61115","journal-title":"IEEE Trans on Inform Theory"},{"key":"280_CR58","doi-asserted-by":"publisher","unstructured":"Caron M, Bojanowski P, Joulin A, Douze M (2018) Deep clustering for unsupervised learning of visual features. In: proceedings of the European conference on computer vision (ECCV), Sep. 8\u201314, Munich, Germany, pp 132\u2013149. https:\/\/doi.org\/10.1007\/978-3-030-01264-9_9","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"280_CR59","doi-asserted-by":"publisher","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Fei-Fei L (2009) ImageNet: a large-scale hierarchical image database. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 20\u201325. Miami, FL, USA, pp. 248\u2013255. https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"280_CR60","doi-asserted-by":"publisher","unstructured":"Teed Z, Deng J, (2020) Raft: recurrent all-pairs field transforms for optical flow. In: proceedings of the European conference on computer vision (ECCV), Aug. 23\u201328. Glasgow, UK, pp 402\u2013419. doi https:\/\/doi.org\/10.1007\/978-3-030-58536-5_24","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"280_CR61","doi-asserted-by":"publisher","unstructured":"Zhou B, Khosla A, Lapedriza A, Oliva A, Torralba A, (2016) Learning deep features for discriminative localization. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), Jun. 27\u201330. Las Vegas, NV, USA, pp 2921\u20132929. doi https:\/\/doi.org\/10.1109\/CVPR.2016.319","DOI":"10.1109\/CVPR.2016.319"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00280-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-023-00280-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00280-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,2]],"date-time":"2023-12-02T14:10:35Z","timestamp":1701526235000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-023-00280-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,19]]},"references-count":61,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["280"],"URL":"https:\/\/doi.org\/10.1007\/s13735-023-00280-x","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"type":"print","value":"2192-6611"},{"type":"electronic","value":"2192-662X"}],"subject":[],"published":{"date-parts":[[2023,7,19]]},"assertion":[{"value":"7 December 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 June 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 June 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 July 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"15"}}