{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T15:25:54Z","timestamp":1780673154437,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":18,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Plan","award":["2021YFE0205700"],"award-info":[{"award-number":["2021YFE0205700"]}]},{"name":"Guangdong Provincial Key R&D Programme","award":["2019B010148001"],"award-info":[{"award-number":["2019B010148001"]}]},{"name":"Science and Technology Development Fund of Macau Project","award":["004\/2020\/A1 0123\/2022\/A3 0070\/2020\/AMJ"],"award-info":[{"award-number":["004\/2020\/A1 0123\/2022\/A3 0070\/2020\/AMJ"]}]},{"name":"External cooperation key project of Chinese Academy Sciences","award":["173211KYSB20200002"],"award-info":[{"award-number":["173211KYSB20200002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612301","type":"proceedings-article","created":{"date-parts":[[2023,11,2]],"date-time":"2023-11-02T10:35:20Z","timestamp":1698921320000},"page":"3149-3160","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":17,"title":["Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action and Gesture Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8014-0067","authenticated-orcid":false,"given":"Yujun","family":"Ma","sequence":"first","affiliation":[{"name":"Dalian Maritime University &amp; Massey University, Dalian, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4883-5552","authenticated-orcid":false,"given":"Benjia","family":"Zhou","sequence":"additional","affiliation":[{"name":"Macau University of Science and Technology, Macau SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2899-9816","authenticated-orcid":false,"given":"Ruili","family":"Wang","sequence":"additional","affiliation":[{"name":"Dalian Maritime University &amp; Massey University, Dalian, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1430-0237","authenticated-orcid":false,"given":"Pichao","family":"Wang","sequence":"additional","affiliation":[{"name":"Amazon Prime Video, Seattle, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"35","author":"Bruce XB","year":"2021","unstructured":"XB Bruce, Yan Liu, and Keith CC Chan. 2021. Multimodal fusion via teacher-student network for indoor action recognition. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 35. 3199--3207."},{"key":"e_1_3_2_2_2_1","volume-title":"Mmnet: A model-based multimodal network for human action recognition in rgb-d videos","author":"Bruce XB","year":"2022","unstructured":"XB Bruce, Yan Liu, Xiang Zhang, Sheng-hua Zhong, and Keith CC Chan. 2022. Mmnet: A model-based multimodal network for human action recognition in rgb-d videos. IEEE Transactions on Pattern Analysis and Machine Intelligence (2022)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3076165"},{"key":"e_1_3_2_2_4_1","volume-title":"Vpn: Learning video-pose embedding for activities of daily living. In Computer Vision-ECCV 2020: 16th European Conference","author":"Das Srijan","year":"2020","unstructured":"Srijan Das, Saurav Sharma, Rui Dai, Francois Bremond, and Monique Thonnat. 2020. Vpn: Learning video-pose embedding for activities of daily living. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part IX 16. Springer, 72--90."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3142675"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00331"},{"key":"e_1_3_2_2_7_1","volume-title":"TSM: Temporal shift module for efficient and scalable video understanding on edge devices","author":"Lin Ji","year":"2020","unstructured":"Ji Lin, Chuang Gan, Kuan Wang, and Song Han. 2020. TSM: Temporal shift module for efficient and scalable video understanding on edge devices. IEEE transactions on pattern analysis and machine intelligence 44, 5 (2020), 2760--2774."},{"key":"e_1_3_2_2_8_1","volume-title":"Ntu rgb d 120: A large-scale benchmark for 3d human activity understanding","author":"Liu Jun","year":"2019","unstructured":"Jun Liu, Amir Shahroudy, Mauricio Perez, Gang Wang, Ling-Yu Duan, and Alex C Kot. 2019. Ntu rgb d 120: A large-scale benchmark for 3d human activity understanding. IEEE transactions on pattern analysis and machine intelligence 42, 10 (2019), 2684--2701."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00127"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3247075"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109741"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2023.02.024"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60639-8_40"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3050642"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.12.020"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.3390\/math11092115"},{"key":"e_1_3_2_2_18_1","unstructured":"Hongyi Zhang Moustapha Cisse Yann N Dauphin and David Lopez-Paz. 2017. mixup: Beyond empirical risk minimization. arXiv preprint arXiv:1710.09412 (2017)"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612301","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612301","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:58:27Z","timestamp":1755820707000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612301"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":18,"alternative-id":["10.1145\/3581783.3612301","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612301","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}