{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:29:21Z","timestamp":1770917361208,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Central Universities of China","award":["D2192860"],"award-info":[{"award-number":["D2192860"]}]},{"name":"National Natural Science Foundation of China","award":["61876208"],"award-info":[{"award-number":["61876208"]}]},{"name":"Key-Area Research and Development Program of Guangdong Province","award":["2018B010108002"],"award-info":[{"award-number":["2018B010108002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475248","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T22:11:38Z","timestamp":1634595098000},"page":"769-778","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":23,"title":["Modeling the Uncertainty for Self-supervised 3D Skeleton Action Representation Learning"],"prefix":"10.1145","author":[{"given":"Yukun","family":"Su","sequence":"first","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]},{"given":"Guosheng","family":"Lin","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"given":"Ruizhou","family":"Sun","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]},{"given":"Yun","family":"Hao","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]},{"given":"Qingyao","family":"Wu","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 9922--9931","author":"Rubinstein Michael","year":"2020"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"e_1_3_2_2_3_1","volume-title":"Interna-tional conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.5555\/3157382.3157527"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01061"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"e_1_3_2_2_9_1","volume-title":"Contrastive Multi-View Representation Learning on Graphs. arXiv preprint arXiv:2006.05582","author":"Hassani Kaveh","year":"2020"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_2_11_1","volume-title":"Learning deep represen-tations by mutual information estimation and maximization. arXiv preprint arXiv:1808.06670","author":"Hjelm R Devon","year":"2018"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.179"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00644"},{"key":"e_1_3_2_2_14_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev etal 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017).  Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.486"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"e_1_3_2_2_17_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013"},{"key":"e_1_3_2_2_18_1","volume-title":"International Conference on Learning Represen-tations (ICLR).","author":"Komodakis Nikos","year":"2018"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_35"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00371"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413548"},{"key":"e_1_3_2_2_22_1","volume-title":"Two-stream 3d convolu-tional neural network for skeleton-based action recognition. arXiv preprint arXiv:1705.08106","author":"Liu Hong","year":"2017"},{"key":"e_1_3_2_2_23_1","volume-title":"Gang Wang, Ling-Yu Duan, and Alex Kot Chichung.","author":"Liu Jun","year":"2019"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3365212"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.02.030"},{"key":"e_1_3_2_2_26_1","volume-title":"Video cloze procedure for self-supervised spatio-temporal learning. arXiv preprint arXiv:2001.00294","author":"Luo Dezhao","year":"2020"},{"key":"e_1_3_2_2_27_1","volume-title":"2018 IEEE Winter Conference on Applications of Computer Vision (WACV). IEEE, 1616--1624","author":"Yue-Hei Ng Joe","year":"2018"},{"key":"e_1_3_2_2_28_1","volume-title":"European Conference on Computer Vision. Springer, 102--118","author":"Nie Qiang","year":"2020"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/3045390.3045603"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"e_1_3_2_2_31_1","volume-title":"Modeling uncertainty with hedged instance embedding. arXiv preprint arXiv:1810.00319","author":"Oh Seong Joon","year":"2018"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.638"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00810"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01230"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_7"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58571-6_3"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00965"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00558"},{"key":"e_1_3_2_2_41_1","volume-title":"Contrastive multiview coding. arXiv preprint arXiv:1906.05849","author":"Tian Yonglong","year":"2019"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.82"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.484"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.281"},{"key":"e_1_3_2_2_45_1","volume-title":"European Conference on Computer Vision. Springer, 504--521","author":"Wang Jiangliu","year":"2020"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2354966"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00201"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"e_1_3_2_2_49_1","volume-title":"PointContrast: Unsupervised Pre-training for 3D Point Cloud Understanding. arXiv preprint arXiv:2007.10985","author":"Xie Saining","year":"2020"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01058"},{"key":"e_1_3_2_2_51_1","volume-title":"Proceedings of the AAAI conference on artificial intelligence","volume":"32","author":"Yan Sijie","year":"2018"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.233"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_9"},{"key":"e_1_3_2_2_54_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 14333--14342","author":"Zhang Xikun","year":"2020"},{"key":"e_1_3_2_2_55_1","volume-title":"Thirty-Second AAAI conference on artificial intelligence.","author":"Zheng Nenggan","year":"2018"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.5555\/3016387.3016423"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475248","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475248","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:16Z","timestamp":1750193296000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475248"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":56,"alternative-id":["10.1145\/3474085.3475248","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475248","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}