{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:14:06Z","timestamp":1750220046743,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":12,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,2,17]],"date-time":"2023-02-17T00:00:00Z","timestamp":1676592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Shenzhen Science and Technology Program","award":["JCYJ20180507183823045 and JCYJ20200109113014456"],"award-info":[{"award-number":["JCYJ20180507183823045 and JCYJ20200109113014456"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,2,17]]},"DOI":"10.1145\/3587716.3587761","type":"proceedings-article","created":{"date-parts":[[2023,9,7]],"date-time":"2023-09-07T23:27:30Z","timestamp":1694129250000},"page":"274-278","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-semantic Representation with Transformer Network for Video Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3040-5880","authenticated-orcid":false,"given":"Yuxi","family":"Sun","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7343-4391","authenticated-orcid":false,"given":"Tonghuan","family":"Xiao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1894-984X","authenticated-orcid":false,"given":"Xutao","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1807-8581","authenticated-orcid":false,"given":"Yunming","family":"Ye","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,9,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Kuehne H Jhuang H Garrote E HMDB: A large video database for human motion recognition[C]\/\/Proceedings of the IEEE International Conference on Computer Vision. 2011: 2556-2563.","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_2_1","first-page":"3551","volume":"2013","author":"Wang H","unstructured":"Wang H, Schmid C. Action recognition with improved trajectories[C]\/\/Proceedings of the IEEE International Conference on Computer Vision. 2013: 3551-3558.","journal-title":"Proceedings of the IEEE International Conference on Computer Vision."},{"key":"e_1_3_2_1_3_1","first-page":"6016","volume":"2018","author":"Fan L","unstructured":"Fan L, Huang W, Gan C, End-to-end learning of motion representation for video understanding[C]\/\/Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2018: 6016-6025.","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_4_1","first-page":"20","volume":"2016","author":"Wang L","unstructured":"Wang L, Xiong Y, Wang Z, Temporal segment networks: Towards good practices for deep action recognition[C]\/\/Proceedings of the European Conference on Computer Vision. 2016: 20-36.","journal-title":"Proceedings of the European Conference on Computer Vision."},{"key":"e_1_3_2_1_5_1","first-page":"4489","volume":"2015","author":"Tran D","unstructured":"Tran D, Bourdev L, Fergus R, Learning spatiotemporal features with 3d convolutional networks[C]\/\/Proceedings of the IEEE International Conference on Computer Vision. 2015: 4489-4497.","journal-title":"Proceedings of the IEEE International Conference on Computer Vision."},{"key":"e_1_3_2_1_6_1","first-page":"7794","volume":"2018","author":"Wang X","unstructured":"Wang X, Girshick R, Gupta A, Non-local neural networks[C]\/\/Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2018: 7794-7803.","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_7_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale[C]\/\/International Conference on Learning Representations","author":"Dosovitskiy A","year":"2020","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale[C]\/\/International Conference on Learning Representations. 2020."},{"key":"e_1_3_2_1_8_1","first-page":"6824","volume":"2021","author":"Fan H","unstructured":"Fan H, Xiong B, Mangalam K, Multiscale vision transformers[C]\/\/Proceedings of the IEEE International Conference on Computer Vision. 2021: 6824-6835.","journal-title":"Proceedings of the IEEE International Conference on Computer Vision."},{"key":"e_1_3_2_1_9_1","first-page":"4305","volume":"2015","author":"Wang L","unstructured":"Wang L, Qiao Y, Tang X. Action recognition with trajectory-pooled deep-convolutional descriptors[C]\/\/Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2015: 4305-4314.","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_10_1","first-page":"284","volume":"2018","author":"Diba A","unstructured":"Diba A, Fayyaz M, Sharma V, Spatio-temporal channel correlation networks for action classification[C]\/\/Proceedings of the European Conference on Computer Vision. 2018: 284-299.","journal-title":"Proceedings of the European Conference on Computer Vision."},{"key":"e_1_3_2_1_11_1","first-page":"7083","volume":"2019","author":"Lin J","unstructured":"Lin J, Gan C, Han S. Tsm: Temporal shift module for efficient video understanding[C]\/\/Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2019: 7083-7093.","journal-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision."},{"key":"e_1_3_2_1_12_1","volume-title":"Gan W","author":"Jiang B","year":"2019","unstructured":"Jiang B, Wang M M, Gan W, Stm: Spatiotemporal and motion encoding for action recognition[C]\/\/Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2019: 2000-2009."}],"event":{"name":"ICMLC 2023: 2023 15th International Conference on Machine Learning and Computing","acronym":"ICMLC 2023","location":"Zhuhai China"},"container-title":["Proceedings of the 2023 15th International Conference on Machine Learning and Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3587716.3587761","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3587716.3587761","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:08:00Z","timestamp":1750183680000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3587716.3587761"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,17]]},"references-count":12,"alternative-id":["10.1145\/3587716.3587761","10.1145\/3587716"],"URL":"https:\/\/doi.org\/10.1145\/3587716.3587761","relation":{},"subject":[],"published":{"date-parts":[[2023,2,17]]},"assertion":[{"value":"2023-09-07","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}