{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T14:10:04Z","timestamp":1749564604084,"version":"3.41.0"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s13735-025-00366-8","type":"journal-article","created":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T09:26:45Z","timestamp":1745573205000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MMDL: a multi-modal deep learning for video highlight detection in sports"],"prefix":"10.1007","volume":"14","author":[{"given":"Qiaoyun","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chih-Yung","family":"Chang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shih-Jung","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hsiang-Chuan","family":"Chang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Diptendu Sinha","family":"Roy","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"366_CR1","doi-asserted-by":"publisher","first-page":"107859","DOI":"10.1016\/j.comnet.2021.107859","volume":"189","author":"A Farrokhi","year":"2021","unstructured":"Farrokhi A, Farahbakhsh R et al (2021) Application of Internet of Things and artificial intelligence for smart fitness: a survey. Comput Netw 189:107859","journal-title":"Comput Netw"},{"key":"366_CR2","doi-asserted-by":"publisher","first-page":"3999","DOI":"10.1109\/TMM.2020.3035285","volume":"23","author":"F Qi","year":"2020","unstructured":"Qi F, Yang X, Xu C (2020) Emotion knowledge driven video highlight detection. IEEE Trans Multimed 23:3999\u20134013","journal-title":"IEEE Trans Multimed"},{"issue":"8","key":"366_CR3","doi-asserted-by":"publisher","first-page":"3195","DOI":"10.1109\/TNNLS.2021.3053249","volume":"33","author":"L Jiao","year":"2022","unstructured":"Jiao L, Zhang R et al (2022) New generation deep learning for video object detection: a survey. IEEE Trans Neural Netw Learning Syst 33(8):3195\u20133215","journal-title":"IEEE Trans Neural Netw Learning Syst"},{"issue":"5","key":"366_CR4","doi-asserted-by":"publisher","first-page":"1995","DOI":"10.1109\/TCSVT.2020.3014491","volume":"31","author":"P Xu","year":"2021","unstructured":"Xu P, Liu K et al (2021) Fine-grained instance-level sketch-based video retrieval. IEEE Trans Circuits Syst Video Technol 31(5):1995\u20132007","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"4","key":"366_CR5","doi-asserted-by":"publisher","first-page":"1986","DOI":"10.1109\/TCSVT.2021.3093928","volume":"32","author":"X Zhang","year":"2022","unstructured":"Zhang X, Wang T et al (2022) Multi-attention convolutional neural network for video deblurring. IEEE Trans Circuits Syst Video Technol 32(4):1986\u20131997","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"366_CR6","doi-asserted-by":"crossref","unstructured":"Rongved OAN, Hicks SA et al. (2020) Real-time detection of events in soccer videos using 3D convolutional neural networks. In: IEEE International Symposium on Multimedia (ISM), pp. 135\u2013144","DOI":"10.1109\/ISM.2020.00030"},{"issue":"7","key":"366_CR7","doi-asserted-by":"publisher","first-page":"54","DOI":"10.12775\/JEHS.2021.11.07.005","volume":"11","author":"B Li","year":"2021","unstructured":"Li B, Xu X (2021) Application of artificial intelligence in basketball sport. J Educ, Health Sport 11(7):54\u201367","journal-title":"J Educ, Health Sport"},{"key":"366_CR8","doi-asserted-by":"publisher","first-page":"191","DOI":"10.1016\/j.comnet.2019.01.028","volume":"151","author":"E Fenil","year":"2019","unstructured":"Fenil E, Manogaran G et al (2019) Real-time violence detection framework for football stadium comprising of big data analysis and deep learning through bidirectional LSTM. Comput Netw 151:191\u2013200","journal-title":"Comput Netw"},{"issue":"2","key":"366_CR9","doi-asserted-by":"publisher","first-page":"647","DOI":"10.1109\/TCSVT.2020.2984569","volume":"31","author":"K Liu","year":"2021","unstructured":"Liu K, Liu W et al (2021) A real-time action representation with temporal encoding and deep compression. IEEE Trans Circuits Syst Video Technol 31(2):647\u2013660","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"366_CR10","first-page":"1","volume":"2021","author":"L Wang","year":"2021","unstructured":"Wang L, Zhang H, Yuan G (2021) Big data and deep learning-based video classification model for sports. Wirel Commun Mob Comput 2021:1\u201311","journal-title":"Wirel Commun Mob Comput"},{"key":"366_CR11","doi-asserted-by":"crossref","unstructured":"Piergiovanni AJ and Ryoo MS (2018) Fine-grained activity recognition in baseball videos. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 1740\u20131748","DOI":"10.1109\/CVPRW.2018.00226"},{"key":"366_CR12","doi-asserted-by":"crossref","unstructured":"Piergiovanni AJ and Ryoo MS (2021) Recognizing actions in videos from unseen viewpoints. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4124\u20134132","DOI":"10.1109\/CVPR46437.2021.00411"},{"key":"366_CR13","doi-asserted-by":"publisher","first-page":"118","DOI":"10.1016\/j.cviu.2018.04.007","volume":"171","author":"P Wang","year":"2018","unstructured":"Wang P et al (2018) RGB-D-based human motion recognition with deep learning: A survey. Comput Vis Image Underst 171:118\u2013139","journal-title":"Comput Vis Image Underst"},{"key":"366_CR14","doi-asserted-by":"crossref","unstructured":"Duan H, Zhao Y et al. (2020) Omni-sourced webly-supervised learning for video recognition. In: European Conference on Computer Vision, pp. 670\u2013688","DOI":"10.1007\/978-3-030-58555-6_40"},{"issue":"7","key":"366_CR15","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TMM.2017.2666540","volume":"19","author":"Y Shi","year":"2017","unstructured":"Shi Y, Tian Y et al (2017) Sequential deep trajectory descriptor for action recognition with three-stream CNN. IEEE Trans Multimed 19(7):1510\u20131520","journal-title":"IEEE Trans Multimed"},{"issue":"10","key":"366_CR16","doi-asserted-by":"publisher","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","volume":"42","author":"J Liu","year":"2020","unstructured":"Liu J, Shahroudy A et al (2020) NTU RGB+D: A large-scale benchmark for 3D human activity understanding. IEEE Trans Pattern Anal Mach Intell 42(10):2684\u20132701","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"366_CR17","doi-asserted-by":"crossref","unstructured":"Zhou J, Lin KY, et al. (2021) Graph-based high order relation modeling for long-term action recognition. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 8984\u20138993","DOI":"10.1109\/CVPR46437.2021.00887"},{"key":"366_CR18","doi-asserted-by":"crossref","unstructured":"Yao A, Gall J, et al. (2011) Does human action recognition benefit from pose estimation. In: British Machine Vision Conference, pp. 67.1\u201367.11","DOI":"10.5244\/C.25.67"},{"key":"366_CR19","doi-asserted-by":"crossref","unstructured":"Du Y, Wang W and Wang L. (2015) Hierarchical recurrent neural network for skeleton based action recognition. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1110\u20131118","DOI":"10.1109\/CVPR.2015.7298714"},{"issue":"9","key":"366_CR20","doi-asserted-by":"publisher","first-page":"4382","DOI":"10.1109\/TIP.2018.2837386","volume":"27","author":"H Wang","year":"2018","unstructured":"Wang H, Wang L (2018) Beyond joints: Learning representations from primitive geometries for skeleton-based action recognition and detection. IEEE Trans Image Process 27(9):4382\u20134394","journal-title":"IEEE Trans Image Process"},{"issue":"7","key":"366_CR21","doi-asserted-by":"publisher","first-page":"3010","DOI":"10.1109\/TIP.2016.2552404","volume":"25","author":"Y Du","year":"2016","unstructured":"Du Y, Fu Y, Wang L (2016) Representation learning of temporal dynamics for skeleton-based action recognition. IEEE Trans Image Process 25(7):3010\u20133022","journal-title":"IEEE Trans Image Process"},{"issue":"9","key":"366_CR22","doi-asserted-by":"publisher","first-page":"2330","DOI":"10.1109\/TMM.2018.2802648","volume":"20","author":"S Zhang","year":"2018","unstructured":"Zhang S, Yang Y et al (2018) Fusing geometric features for skeleton-based action recognition using multilayer LSTM networks. IEEE Trans Multimed 20(9):2330\u20132343","journal-title":"IEEE Trans Multimed"},{"issue":"12","key":"366_CR23","doi-asserted-by":"publisher","first-page":"3007","DOI":"10.1109\/TPAMI.2017.2771306","volume":"40","author":"J Liu","year":"2018","unstructured":"Liu J, Shahroudy A et al (2018) Skeleton-based action recognition using spatio-temporal LSTM network with trust gates. IEEE Trans Pattern Anal Mach Intell 40(12):3007\u20133021","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"4","key":"366_CR24","doi-asserted-by":"publisher","first-page":"1586","DOI":"10.1109\/TIP.2017.2785279","volume":"27","author":"J Liu","year":"2018","unstructured":"Liu J, Wang G et al (2018) Skeleton-based human action recognition with global context-aware attention LSTM networks. IEEE Trans Image Process 27(4):1586\u20131599","journal-title":"IEEE Trans Image Process"},{"key":"366_CR25","unstructured":"Devlin J, Chang MW, et al. (2019) BERT: Pre-training of deep bidirectional transformers for language understanding. In: North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 4171\u20134186"},{"key":"366_CR26","doi-asserted-by":"publisher","first-page":"5477","DOI":"10.1109\/TIP.2021.3076556","volume":"30","author":"W Jin","year":"2021","unstructured":"Jin W, Zhao Z et al (2021) Adaptive spatio-temporal graph enhanced vision-language representation for video QA. IEEE Trans Image Process 30:5477\u20135489","journal-title":"IEEE Trans Image Process"},{"key":"366_CR27","doi-asserted-by":"publisher","first-page":"2914","DOI":"10.1109\/TMM.2021.3090595","volume":"24","author":"X Song","year":"2021","unstructured":"Song X, Chen J et al (2021) Spatial-temporal graphs for cross-modal text2video retrieval. IEEE Trans Multimed 24:2914\u20132923","journal-title":"IEEE Trans Multimed"},{"key":"366_CR28","unstructured":"Malhotra P, Vig L et al. (2015) Long short term memory networks for anomaly detection in time series. In: European Symposium on Artificial Neural Networks Computational Intelligence and Machine Learning(ESANN), pp. 89\u201394"},{"issue":"7","key":"366_CR29","doi-asserted-by":"publisher","first-page":"3459","DOI":"10.1109\/TIP.2018.2818328","volume":"27","author":"S Song","year":"2018","unstructured":"Song S, Lan C et al (2018) Spatio-temporal attention-based LSTM networks for 3D action recognition and detection. IEEE Trans Image Process 27(7):3459\u20133471","journal-title":"IEEE Trans Image Process"},{"issue":"21","key":"366_CR30","doi-asserted-by":"publisher","first-page":"15990","DOI":"10.1109\/JIOT.2020.3042986","volume":"8","author":"S Xu","year":"2021","unstructured":"Xu S, Rao H et al (2021) Attention-based multilevel co-occurrence graph convolutional LSTM for 3-D action recognition. IEEE Internet Things J 8(21):15990\u201316001","journal-title":"IEEE Internet Things J"},{"issue":"18","key":"366_CR31","doi-asserted-by":"publisher","first-page":"17421","DOI":"10.1109\/JSEN.2021.3059685","volume":"22","author":"Q Gao","year":"2022","unstructured":"Gao Q, Chen Y et al (2022) Dynamic hand gesture recognition based on 3D hand pose estimation for human\u2013robot interaction. IEEE Sens J 22(18):17421\u201317430","journal-title":"IEEE Sens J"},{"issue":"8","key":"366_CR32","doi-asserted-by":"publisher","first-page":"1583","DOI":"10.1109\/TPAMI.2016.2537340","volume":"38","author":"D Wu","year":"2016","unstructured":"Wu D, Pigou L et al (2016) Deep dynamic neural networks for multimodal gesture segmentation and recognition. IEEE Trans Pattern Anal Mach Intell 38(8):1583\u20131597","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"4","key":"366_CR33","doi-asserted-by":"publisher","first-page":"1011","DOI":"10.1109\/TMM.2018.2869278","volume":"21","author":"G Zhu","year":"2019","unstructured":"Zhu G, Zhang L et al (2019) Continuous gesture segmentation and recognition using 3DCNN and convolutional LSTM. IEEE Trans Multimed 21(4):1011\u20131021","journal-title":"IEEE Trans Multimed"},{"issue":"1","key":"366_CR34","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1109\/TAI.2022.3151307","volume":"4","author":"V Chandrakanth","year":"2023","unstructured":"Chandrakanth V, Murthy VSN, Channappayya SS (2023) Siamese cross-domain tracker design for seamless tracking of targets in RGB and thermal videos. IEEE Trans Artif Intell 4(1):161\u2013172","journal-title":"IEEE Trans Artif Intell"},{"issue":"6","key":"366_CR35","doi-asserted-by":"publisher","first-page":"2260","DOI":"10.1109\/TCSVT.2020.3017727","volume":"31","author":"H Jain","year":"2021","unstructured":"Jain H, Harit G, Sharma A (2021) Action quality assessment using siamese network-based deep metric learning. IEEE Trans Circuits Syst Video Technol 31(6):2260\u20132273","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"1","key":"366_CR36","doi-asserted-by":"publisher","first-page":"186","DOI":"10.1109\/TCSVT.2021.3102886","volume":"33","author":"C Fan","year":"2023","unstructured":"Fan C, Yu H et al (2023) Siamon: Siamese occlusion-aware network for visual tracking. IEEE Trans Circuits Syst Video Technol 33(1):186\u2013199","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"3","key":"366_CR37","doi-asserted-by":"publisher","first-page":"1095","DOI":"10.1109\/TCYB.2017.2756840","volume":"48","author":"C Cao","year":"2018","unstructured":"Cao C, Zhang Y et al (2018) Body joint guided 3-D deep convolutional descriptors for action recognition. IEEE Trans Cybern 48(3):1095\u20131108","journal-title":"IEEE Trans Cybern"},{"issue":"3","key":"366_CR38","doi-asserted-by":"publisher","first-page":"634","DOI":"10.1109\/TMM.2017.2749159","volume":"20","author":"X Wang","year":"2018","unstructured":"Wang X, Gao L et al (2018) Two-stream 3-D convnet fusion for action recognition in videos with arbitrary size and length. IEEE Trans Multimed 20(3):634\u2013644","journal-title":"IEEE Trans Multimed"},{"issue":"8","key":"366_CR39","doi-asserted-by":"publisher","first-page":"1963","DOI":"10.1109\/TPAMI.2019.2896631","volume":"41","author":"P Zhang","year":"2019","unstructured":"Zhang P, Lan C et al (2019) View adaptive neural networks for high performance skeleton-based human action recognition. IEEE Trans Pattern Anal Mach Intell 41(8):1963\u20131978","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"1","key":"366_CR40","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1109\/TPAMI.2019.2929257","volume":"43","author":"Z Cao","year":"2021","unstructured":"Cao Z, Hidalgo G et al (2021) OpenPose: realtime multi-person 2D pose estimation using part affinity fields. IEEE Trans Pattern Anal Mach Intell 43(1):172\u2013186","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"366_CR41","unstructured":"Lugaresi C, Tang J et al. (2019) Mediapipe: A framework for perceiving and processing reality. In: Third Workshop on Computer Vision for AR\/VR at IEEE Computer Vision and Pattern Recognition (CVPR), pp. 1\u20134"},{"issue":"1","key":"366_CR42","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji S, Xu W et al (2013) 3D convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"6","key":"366_CR43","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2017) Imagenet classification with deep convolutional neural networks. Commun ACM 60(6):84\u201390","journal-title":"Commun ACM"},{"key":"366_CR44","doi-asserted-by":"crossref","unstructured":"Carreira J and Zisserman A. (2017) Quo vadis, action recognition? A new model and the kinetics dataset. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"366_CR45","doi-asserted-by":"crossref","unstructured":"Wang H and Schmid C. (2013) Action recognition with improved trajectories. In: IEEE International Conference on Computer Vision, pp. 3551\u20133558","DOI":"10.1109\/ICCV.2013.441"},{"key":"366_CR46","unstructured":"Vaswani A, Shazeer N, Parmar N et al. (2017) Attention Is All You Need. In: 31st Conference on Neural Information Processing Systems, pp. 1\u201311"},{"key":"366_CR47","unstructured":"Lugaresi C, Tang J, et al. (2019) Mediapipe: A framework for perceiving and processing reality, In: IEEE Computer Vision and Pattern Recognition (CVPR), pp. 1\u20134"},{"key":"366_CR48","doi-asserted-by":"crossref","unstructured":"Chu F, Cao J, Shao Z et al. (2022) Illumination-guided transformer-based network for multispectral pedestrian detection. In: International Conference on Artificial Intelligence, pp. 343\u2013355","DOI":"10.1007\/978-3-031-20497-5_28"},{"key":"366_CR49","doi-asserted-by":"crossref","unstructured":"Chen N, Xie J, Nie J, et al. (2023) Attentive alignment network for multispectral pedestrian detection. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 3787\u20133795","DOI":"10.1145\/3581783.3613444"},{"issue":"5","key":"366_CR50","doi-asserted-by":"publisher","first-page":"3739","DOI":"10.1109\/TITS.2023.3330155","volume":"25","author":"A Gao","year":"2024","unstructured":"Gao A, Pang Y, Nie J et al (2024) Toward generalizable multispectral pedestrian detection. IEEE Trans Intell Transp Syst 25(5):3739\u20133750","journal-title":"IEEE Trans Intell Transp Syst"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00366-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-025-00366-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00366-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T13:41:08Z","timestamp":1749562868000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-025-00366-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":50,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["366"],"URL":"https:\/\/doi.org\/10.1007\/s13735-025-00366-8","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"type":"print","value":"2192-6611"},{"type":"electronic","value":"2192-662X"}],"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"13 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 January 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 March 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 April 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This study does not involve either human subjects or animals.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}},{"value":"All participating authors have been informed.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed consent"}}],"article-number":"18"}}