{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:29:25Z","timestamp":1740122965599,"version":"3.37.3"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2022,1,8]],"date-time":"2022-01-08T00:00:00Z","timestamp":1641600000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,8]],"date-time":"2022-01-08T00:00:00Z","timestamp":1641600000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["WK2150110007","WK2150110012"],"award-info":[{"award-number":["WK2150110007","WK2150110012"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61772490","61472382","61472381","61572454"],"award-info":[{"award-number":["61772490","61472382","61472381","61572454"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2022,4]]},"DOI":"10.1007\/s11042-021-11247-7","type":"journal-article","created":{"date-parts":[[2022,1,8]],"date-time":"2022-01-08T19:02:38Z","timestamp":1641668558000},"page":"12157-12176","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["D2F: discriminative dense fusion of appearance and motion modalities for end-to-end video classification"],"prefix":"10.1007","volume":"81","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9944-5151","authenticated-orcid":false,"given":"Lin","family":"Wang","sequence":"first","affiliation":[]},{"given":"Xingfu","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Ammar","family":"Hawbani","sequence":"additional","affiliation":[]},{"given":"Yan","family":"Xiong","sequence":"additional","affiliation":[]},{"given":"Xu","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,1,8]]},"reference":[{"key":"11247_CR1","doi-asserted-by":"crossref","unstructured":"Abavisani M, Joze HRV, Patel VM (2019) Improving the performance of unimodal dynamic hand-gesture recognition with multimodal training. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 1165\u20131174","DOI":"10.1109\/CVPR.2019.00126"},{"key":"11247_CR2","doi-asserted-by":"crossref","unstructured":"Asadi-Aghbolaghi M, Bertiche H, Roig V, Kasaei S, Escalera S (2017) Action recognition from rgb-d data: comparison and fusion of spatio-temporal handcrafted features and deep strategies. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) Workshops","DOI":"10.1109\/ICCVW.2017.376"},{"key":"11247_CR3","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"11247_CR4","doi-asserted-by":"crossref","unstructured":"Chollet F (2017) Xception: deep learning with depthwise separable convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1251\u20131258","DOI":"10.1109\/CVPR.2017.195"},{"key":"11247_CR5","doi-asserted-by":"crossref","unstructured":"Duan B, Tang H, Wang W, Zong Z, Yang G, Yan Y (2021) Audio-visual event localization via recursive fusion by joint co-attention. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 4013\u20134022 (2021)","DOI":"10.1109\/WACV48630.2021.00406"},{"key":"11247_CR6","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: Proceedings of the IEEE international conference on computer vision, pp 6202\u20136211","DOI":"10.1109\/ICCV.2019.00630"},{"key":"11247_CR7","unstructured":"Feichtenhofer C, Pinz A, Wildes RP (2016) Spatiotemporal residual networks for video action recognition. In: Proceedings of the 30th International Conference on Neural Information Processing Systems, NIPS16, p 34763484. Curran Associates Inc., Red Hook, NY, USA"},{"key":"11247_CR8","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Wildes RP (2017) Spatiotemporal multiplier networks for video action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4768\u20134777","DOI":"10.1109\/CVPR.2017.787"},{"key":"11247_CR9","unstructured":"Goyal P, Sahu S, Ghosh S, Lee C (2020) Cross-modal learning for multi-modal video categorization"},{"key":"11247_CR10","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"11247_CR11","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: accelerating deep network training by reducing internal covariate shift. arXiv preprint. arXiv:1502.03167"},{"key":"11247_CR12","doi-asserted-by":"crossref","unstructured":"Jain SD, Xiong B, Grauman K (2017) Fusionseg: learning to combine motion and appearance for fully automatic segmentation of generic objects in videos. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 2117\u20132126","DOI":"10.1109\/CVPR.2017.228"},{"key":"11247_CR13","doi-asserted-by":"crossref","unstructured":"Kalfaoglu M, Kalkan S, Alatan AA (2020) Late temporal modeling in 3d cnn architectures with bert for action recognition. arXiv preprint. arXiv:2008.01232","DOI":"10.1007\/978-3-030-68238-5_48"},{"issue":"9","key":"11247_CR14","doi-asserted-by":"publisher","first-page":"1635","DOI":"10.1109\/JPROC.2015.2459017","volume":"103","author":"AK Katsaggelos","year":"2015","unstructured":"Katsaggelos AK, Bahaadini S, Molina R (2015) Audiovisual fusion: challenges and new approaches. Proc IEEE 103(9):1635\u20131653","journal-title":"Proc IEEE"},{"key":"11247_CR15","doi-asserted-by":"publisher","first-page":"105986","DOI":"10.1016\/j.asoc.2019.105986","volume":"87","author":"MA Khan","year":"2020","unstructured":"Khan MA, Sharif M, Akram T, Raza M, Saba T (2020) Rehman A (2020) Hand-crafted and deep convolutional neural network features fusion and selection strategy: an application to intelligent human action recognition. Appl Soft Comput 87:105986","journal-title":"Appl Soft Comput"},{"issue":"14","key":"11247_CR16","doi-asserted-by":"publisher","first-page":"10423","DOI":"10.1007\/s00521-019-04578-y","volume":"32","author":"SA Khowaja","year":"2020","unstructured":"Khowaja SA, Lee SL (2020) Hybrid and hierarchical fusion networks: a deep cross-modal learning architecture for action recognition. Neural Comput Appl 32(14):10423\u201310434","journal-title":"Neural Comput Appl"},{"key":"11247_CR17","doi-asserted-by":"crossref","unstructured":"Kuehne H, Jhuang H, Garrote E, Poggio T, Serre T (2011) HMDB: a large video database for human motion recognition. In: Proceedings of the International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"11247_CR18","unstructured":"Li A, Thotakuri M, Ross DA, Carreira J, Vostrikov A, Zisserman A (2020) The ava-kinetics localized human actions video dataset. arXiv preprint. arXiv:2005.00214"},{"key":"11247_CR19","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1016\/j.patrec.2017.12.003","volume":"119","author":"Y Li","year":"2019","unstructured":"Li Y, Miao Q, Tian K, Fan Y, Xu X, Ma Z, Song J (2019) Large-scale gesture recognition with a fusion of rgb-d data based on optical flow and the c3d model. Pattern Recogn Lett 119:187\u2013194","journal-title":"Pattern Recogn Lett"},{"key":"11247_CR20","unstructured":"Liu K, Liu W, Gan C, Tan M, Ma H (2018) T-C3D: temporal convolutional 3d network for real-time action recognition. In: SA McIlraith, KQ Weinberger (eds.) Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence, (AAAI-18), the 30th innovative Applications of Artificial Intelligence (IAAI-18), and the 8th AAAI Symposium on Educational Advances in Artificial Intelligence (EAAI-18), New Orleans, Louisiana, USA, February 2\u20137, 2018, pp. 7138\u20137145. AAAI Press (2018). URL https:\/\/www.aaai.org\/ocs\/index. php\/AAAI\/AAAI18\/paper\/view\/17205"},{"key":"11247_CR21","doi-asserted-by":"publisher","unstructured":"Mai S, Hu H, Xing S\u00a0(2020) Modality to modality translation: An adversarial representation learning and graph fusion network for multimodal fusion. In: Proceedings of the AAAI Conference on Artificial Intelligence 34(1):164\u2013172. https:\/\/doi.org\/10.1609\/aaai.v34i01.5347. URL https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/5347","DOI":"10.1609\/aaai.v34i01.5347"},{"issue":"6","key":"11247_CR22","doi-asserted-by":"publisher","first-page":"96","DOI":"10.1109\/MSP.2017.2738401","volume":"34","author":"D Ramachandram","year":"2017","unstructured":"Ramachandram D, Taylor GW (2017) Deep multimodal learning: a survey on recent advances and trends. IEEE Signal Process Mag 34(6):96\u2013108","journal-title":"IEEE Signal Process Mag"},{"key":"11247_CR23","doi-asserted-by":"crossref","unstructured":"Rashed H, Yogamani S, El-Sallab A, Krizek P, El-Helw M (2019) Optical flow augmented semantic segmentation networks for automated driving. arXiv preprint. arXiv:1901.07355","DOI":"10.5220\/0007248301650172"},{"key":"11247_CR24","doi-asserted-by":"crossref","unstructured":"Riva M, Wand M, Schmidhuber J (2020) Motion dynamics improve speaker-independent lipreading. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 4407\u20134411","DOI":"10.1109\/ICASSP40776.2020.9053535"},{"key":"11247_CR25","doi-asserted-by":"crossref","unstructured":"Roitberg A, Pollert T, Haurilet M, Martin M, Stiefelhagen R (2019) Analysis of deep fusion strategies for multi-modal gesture recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops","DOI":"10.1109\/CVPRW.2019.00029"},{"key":"11247_CR26","unstructured":"Saha S, Singh G, Cuzzolin F (2020) Two-stream amtnet for action detection. arXiv preprint. arXiv:2004.01494"},{"key":"11247_CR27","unstructured":"Sarma D, Kavyasree V, Bhuyan M (2020) Two-stream fusion model for dynamic hand gesture recognition using 3d-cnn and 2d-cnn optical flow guided motion template. arXiv preprint. arXiv:2007.08847"},{"issue":"7","key":"11247_CR28","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TMM.2017.2666540","volume":"19","author":"Y Shi","year":"2017","unstructured":"Shi Y, Tian Y, Wang Y, Huang T (2017) Sequential deep trajectory descriptor for action recognition with three-stream cnn. IEEE Trans Multimedia 19(7):1510\u20131520. https:\/\/doi.org\/10.1109\/TMM.2017.2666540","journal-title":"IEEE Trans Multimedia"},{"key":"11247_CR29","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Advances in neural information processing systems, pp 568\u2013576"},{"key":"11247_CR30","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. Comput Vis Pattern Recognit"},{"key":"11247_CR31","unstructured":"Sohn K, Shang W, Lee H (2014) Improved multimodal deep learning with variation of information. In: Advances in neural information processing systems, pp 2141\u20132149"},{"key":"11247_CR32","doi-asserted-by":"publisher","first-page":"3957","DOI":"10.1109\/TIP.2020.2967577","volume":"29","author":"S Song","year":"2020","unstructured":"Song S, Liu J, Li Y, Guo Z (2020) Modality compensation network: cross-modal adaptation for action recognition. IEEE Trans Image Process 29:3957\u20133969","journal-title":"IEEE Trans Image Process"},{"key":"11247_CR33","unstructured":"Soomro K, Zamir AR, Shah M (2012) UCF101: A dataset of 101 human actions classes from videos in the wild. Comput Vis Pattern Recogn"},{"key":"11247_CR34","doi-asserted-by":"crossref","unstructured":"Sterpu G, Saam C, Harte N (2020) Should we hard-code the recurrence concept or learn it instead? Exploring the transformer architecture for audio-visual speech recognition","DOI":"10.21437\/Interspeech.2020-2480"},{"key":"11247_CR35","doi-asserted-by":"crossref","unstructured":"Su R, Ouyang W, Zhou L, Xu D (2019) Improving action localization by progressive crossstream cooperation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 12016\u201312025","DOI":"10.1109\/CVPR.2019.01229"},{"key":"11247_CR36","doi-asserted-by":"crossref","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van Gool L (2016) Temporal segment networks: Towards good practices for deep action recognition. In: European conference on computer vision, pp 20\u201336, Springer","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"11247_CR37","unstructured":"Weng X, Kitani K (2019) Learning spatio-temporal features with two-stream deep 3d cnns for lipreading. arXiv preprint. arXiv:1905.02540"},{"key":"11247_CR38","doi-asserted-by":"crossref","unstructured":"Xiao J, Yang S, Zhang Y, Shan S, Chen X (2020) Deformation flow based two-stream network for lip reading. arXiv preprint. arXiv:2003.05709","DOI":"10.1109\/FG47880.2020.00132"},{"key":"11247_CR39","doi-asserted-by":"crossref","unstructured":"Xu B, Lu C, Guo Y, Wang J (2020) Discriminative multi-modality speech recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR42600.2020.01444"},{"key":"11247_CR40","doi-asserted-by":"crossref","unstructured":"Yang C, Xu Y, Shi J, Dai B, Zhou B (2020) Temporal pyramid network for action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 591\u2013600","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"11247_CR41","doi-asserted-by":"publisher","first-page":"106713","DOI":"10.1016\/j.asoc.2020.106713","volume":"97","author":"L Yao","year":"2020","unstructured":"Yao L, Yang W (2020) Huang W (2020) A data augmentation method for human action recognition using dense joint motion images. Appl Soft Comput 97:106713","journal-title":"Appl Soft Comput"},{"key":"11247_CR42","doi-asserted-by":"crossref","unstructured":"Yue-Hei Ng J, Hausknecht M, Vijayanarasimhan S, Vinyals O, Monga R, Toderici G (2015) Beyond short snippets: deep networks for video classification. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4694\u20134702","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"11247_CR43","doi-asserted-by":"crossref","unstructured":"Zach C, Pock T, Bischof H (2007) A duality based approach for realtime tv-l 1 optical flow. In: Joint pattern recognition symposium, pp 214\u2013223, Springer","DOI":"10.1007\/978-3-540-74936-3_22"},{"key":"11247_CR44","doi-asserted-by":"publisher","first-page":"107312","DOI":"10.1016\/j.patcog.2020.107312","volume":"103","author":"D Zhang","year":"2020","unstructured":"Zhang D, He L, Tu Z, Zhang S, Han F (2020) Yang B Learning motion representation for real-time spatio-temporal action localization. Pattern Recognit 103:107312","journal-title":"Pattern Recognit"},{"key":"11247_CR45","doi-asserted-by":"crossref","unstructured":"Zhao J, Snoek CG (2019) Dance with flow: two-in-one stream action detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 9935\u20139944","DOI":"10.1109\/CVPR.2019.01017"},{"key":"11247_CR46","first-page":"3","volume":"2","author":"T Zhou","year":"2020","unstructured":"Zhou T, Wang S, Zhou Y, Yao Y, Li J, Shao L (2020) Motion-attentive transition for zero-shot video object segmentation. Proc AAAI Conf Artif intel 2:3","journal-title":"Proc AAAI Conf Artif intel"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-11247-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-021-11247-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-11247-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,13]],"date-time":"2022-04-13T19:51:52Z","timestamp":1649879512000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-021-11247-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,8]]},"references-count":46,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2022,4]]}},"alternative-id":["11247"],"URL":"https:\/\/doi.org\/10.1007\/s11042-021-11247-7","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2022,1,8]]},"assertion":[{"value":"31 July 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 May 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 July 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 January 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}