{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T14:39:24Z","timestamp":1742913564984,"version":"3.40.3"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783031109850"},{"type":"electronic","value":"9783031109867"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-10986-7_4","type":"book-chapter","created":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T22:30:36Z","timestamp":1658183436000},"page":"43-56","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Sparse Dense Transformer Network for\u00a0Video Action Recognition"],"prefix":"10.1007","author":[{"given":"Xiaochun","family":"Qu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zheyuan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Xiao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinye","family":"Ran","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guodong","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zili","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,7,19]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViViT: a video vision transformer. arXiv preprint arXiv:2103.15691 (2021)","key":"4_CR1","DOI":"10.1109\/ICCV48922.2021.00676"},{"unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv preprint arXiv:1607.06450 (2016)","key":"4_CR2"},{"unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? arXiv preprint arXiv:2102.05095 (2021)","key":"4_CR3"},{"doi-asserted-by":"crossref","unstructured":"Cao, W.P., et al.: An ensemble fuzziness-based online sequential learning approach and its application. In: International Conference on Knowledge Science, Engineering and Management (KSEM), pp. 255\u2013267 (2021)","key":"4_CR4","DOI":"10.1007\/978-3-030-82136-4_21"},{"key":"4_CR5","doi-asserted-by":"publisher","first-page":"237","DOI":"10.1016\/j.neunet.2021.03.016","volume":"140","author":"W Cao","year":"2021","unstructured":"Cao, W., Xie, Z., Li, J., Xu, Z., Ming, Z., Wang, X.: Bidirectional stochastic configuration network for regression problems. Neural Netw. 140, 237\u2013246 (2021)","journal-title":"Neural Netw."},{"doi-asserted-by":"crossref","unstructured":"Cao, W., Yang, P., Ming, Z., Cai, S., Zhang, J.: An improved fuzziness based random vector functional link network for liver disease detection. In: 2020 IEEE 6th International Conference on Big Data Security on Cloud (BigDataSecurity), IEEE International Conference on High Performance and Smart Computing, (HPSC) and IEEE International Conference on Intelligent Data and Security (IDS), pp. 42\u201348 (2020)","key":"4_CR6","DOI":"10.1109\/BigDataSecurity-HPSC-IDS49724.2020.00019"},{"doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6299\u20136308 (2017)","key":"4_CR7","DOI":"10.1109\/CVPR.2017.502"},{"unstructured":"Dosovitskiy, A., et al.: An image is worth 16 $$\\times $$ 16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)","key":"4_CR8"},{"doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: Multiscale vision transformers. arXiv preprint arXiv:2104.11227 (2021)","key":"4_CR9","DOI":"10.1109\/ICCV48922.2021.00675"},{"doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3D: expanding architectures for efficient video recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 203\u2013213 (2020)","key":"4_CR10","DOI":"10.1109\/CVPR42600.2020.00028"},{"doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast networks for video recognition. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 6202\u20136211 (2019)","key":"4_CR11","DOI":"10.1109\/ICCV.2019.00630"},{"doi-asserted-by":"crossref","unstructured":"Gao, R., Oh, T.H., Grauman, K., Torresani, L.: Listen to look: action recognition by previewing audio. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10457\u201310467 (2020)","key":"4_CR12","DOI":"10.1109\/CVPR42600.2020.01047"},{"doi-asserted-by":"crossref","unstructured":"Girdhar, R., Carreira, J., Doersch, C., Zisserman, A.: Video action transformer network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 244\u2013253 (2019)","key":"4_CR13","DOI":"10.1109\/CVPR.2019.00033"},{"unstructured":"Han, K., Xiao, A., Wu, E., Guo, J., Xu, C., Wang, Y.: Transformer in transformer. arXiv preprint arXiv:2103.00112 (2021)","key":"4_CR14"},{"doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","key":"4_CR15","DOI":"10.1109\/CVPR.2016.90"},{"issue":"4","key":"4_CR16","doi-asserted-by":"publisher","first-page":"656","DOI":"10.1109\/TITB.2009.2023116","volume":"13","author":"F Hu","year":"2009","unstructured":"Hu, F., Lakdawala, S., Hao, Q., Qiu, M.: Low-power, intelligent sensor hardware interface for medical data preprocessing. IEEE Trans. Inf Technol. Biomed. 13(4), 656\u2013663 (2009)","journal-title":"IEEE Trans. Inf Technol. Biomed."},{"doi-asserted-by":"crossref","unstructured":"Kahatapitiya, K., Ryoo, M.S.: Coarse-fine networks for temporal activity detection in videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8385\u20138394 (2021)","key":"4_CR17","DOI":"10.1109\/CVPR46437.2021.00828"},{"key":"4_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"731","DOI":"10.1007\/978-3-030-68238-5_48","volume-title":"Computer Vision \u2013 ECCV 2020 Workshops","author":"ME Kalfaoglu","year":"2020","unstructured":"Kalfaoglu, M.E., Kalkan, S., Alatan, A.A.: Late temporal modeling in 3D CNN architectures with BERT for action recognition. In: Bartoli, A., Fusiello, A. (eds.) ECCV 2020. LNCS, vol. 12539, pp. 731\u2013747. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-68238-5_48"},{"unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)","key":"4_CR19"},{"issue":"11","key":"4_CR20","doi-asserted-by":"publisher","first-page":"2990","DOI":"10.1109\/TMM.2020.2965434","volume":"22","author":"J Li","year":"2020","unstructured":"Li, J., Liu, X., Zhang, W., Zhang, M., Song, J., Sebe, N.: Spatio-temporal attention networks for action recognition and detection. IEEE Trans. Multimedia 22(11), 2990\u20133001 (2020)","journal-title":"IEEE Trans. Multimedia"},{"issue":"4","key":"4_CR21","doi-asserted-by":"publisher","first-page":"2833","DOI":"10.1109\/TII.2020.3008010","volume":"17","author":"Y Li","year":"2020","unstructured":"Li, Y., Song, Y., Jia, L., Gao, S., Li, Q., Qiu, M.: Intelligent fault diagnosis by fusing domain adversarial training and maximum mean discrepancy via ensemble learning. IEEE Trans. Industr. Inf. 17(4), 2833\u20132841 (2020)","journal-title":"IEEE Trans. Industr. Inf."},{"unstructured":"Liu, Z., et al.: Video Swin transformer. arXiv preprint arXiv:2106.13230 (2021)","key":"4_CR22"},{"doi-asserted-by":"crossref","unstructured":"Neimark, D., Bar, O., Zohar, M., Asselmann, D.: Video transformer network. arXiv preprint arXiv:2102.00719 (2021)","key":"4_CR23","DOI":"10.1109\/ICCVW54120.2021.00355"},{"issue":"7","key":"4_CR24","doi-asserted-by":"publisher","first-page":"4560","DOI":"10.1109\/TITS.2020.3032882","volume":"22","author":"H Qiu","year":"2020","unstructured":"Qiu, H., Zheng, Q., Msahli, M., Memmi, G., Qiu, M., Lu, J.: Topological graph convolutional network-based urban traffic flow and density prediction. IEEE Trans. Intell. Transp. Syst. 22(7), 4560\u20134569 (2020)","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Mei, T.: Learning spatio-temporal representation with pseudo-3d residual networks. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 5533\u20135541 (2017)","key":"4_CR25","DOI":"10.1109\/ICCV.2017.590"},{"unstructured":"Sharir, G., Noy, A., Zelnik-Manor, L.: An image is worth 16 $$\\times $$ 16 words, what is a video worth? arXiv preprint arXiv:2103.13915 (2021)","key":"4_CR26"},{"unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. arXiv preprint arXiv:1406.2199 (2014)","key":"4_CR27"},{"doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 4489\u20134497 (2015)","key":"4_CR28","DOI":"10.1109\/ICCV.2015.510"},{"doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Feiszli, M.: Video classification with channel-separated convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 5552\u20135561 (2019)","key":"4_CR29","DOI":"10.1109\/ICCV.2019.00565"},{"doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6450\u20136459 (2018)","key":"4_CR30","DOI":"10.1109\/CVPR.2018.00675"},{"issue":"1","key":"4_CR31","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/0896-6273(94)90455-3","volume":"13","author":"DC Van Essen","year":"1994","unstructured":"Van Essen, D.C., Gallant, J.L.: Neural mechanisms of form and motion processing in the primate visual system. Neuron 13(1), 1\u201310 (1994)","journal-title":"Neuron"},{"unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems (NIPS), pp. 5998\u20136008 (2017)","key":"4_CR32"},{"key":"4_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"unstructured":"Zhang, S., Guo, S., Huang, W., Scott, M.R., Wang, L.: V4D: 4d convolutional neural networks for video-level representation learning. arXiv preprint arXiv:2002.07442 (2020)","key":"4_CR34"},{"key":"4_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"713","DOI":"10.1007\/978-3-030-01216-8_43","volume-title":"Computer Vision \u2013 ECCV 2018","author":"M Zolfaghari","year":"2018","unstructured":"Zolfaghari, M., Singh, K., Brox, T.: ECO: efficient convolutional network for online video understanding. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11206, pp. 713\u2013730. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01216-8_43"}],"container-title":["Lecture Notes in Computer Science","Knowledge Science, Engineering and Management"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-10986-7_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T22:31:32Z","timestamp":1658183492000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-10986-7_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031109850","9783031109867"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-10986-7_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"19 July 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"KSEM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Knowledge Science, Engineering and Management","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Singapore","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Singapore","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 August 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 August 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ksem2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ksem22.smart-conf.net\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"498","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"169","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"34% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"10","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}