{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T04:29:58Z","timestamp":1781584198709,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Science and Technology Innovation Committee of Shenzhen Municipalit Foundation","award":["No.JCYJ20210324132203007"],"award-info":[{"award-number":["No.JCYJ20210324132203007"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613795","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T11:27:30Z","timestamp":1698406050000},"page":"8590-8597","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":23,"title":["Localization-assisted Uncertainty Score Disentanglement Network for Action Quality Assessment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9122-6141","authenticated-orcid":false,"given":"Yanli","family":"Ji","sequence":"first","affiliation":[{"name":"Shenzhen Institute for Advanced Study &amp; UESTC, Chengdu, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7350-6938","authenticated-orcid":false,"given":"Lingfeng","family":"Ye","sequence":"additional","affiliation":[{"name":"Shenzhen Institute for Advanced Study &amp; UESTC, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1506-6279","authenticated-orcid":false,"given":"Huili","family":"Huang","sequence":"additional","affiliation":[{"name":"Shenzhen Institute for Advanced Study &amp; UESTC, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9846-1170","authenticated-orcid":false,"given":"Lijing","family":"Mao","sequence":"additional","affiliation":[{"name":"UESTC, Chengdu, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9122-6141","authenticated-orcid":false,"given":"Yang","family":"Zhou","sequence":"additional","affiliation":[{"name":"UESTC, Chengdu, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9122-6141","authenticated-orcid":false,"given":"Lingling","family":"Gao","sequence":"additional","affiliation":[{"name":"UESTC, Chengdu, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Yang Bai Desen Zhou Songyang Zhang Jian Wang Errui Ding Yu Guan Yang Long and Jingdong Wang. 2022. Action quality assessment with temporal parsing transformer. In ECCV. 422--438.","DOI":"10.1007\/978-3-031-19772-7_25"},{"key":"e_1_3_2_2_2_1","volume-title":"Stella X Yu, and Jianbo Shi.","author":"Bertasius Gedas","year":"2017","unstructured":"Gedas Bertasius, Hyun Soo Park, Stella X Yu, and Jianbo Shi. 2017. Am I a baller? basketball performance assessment from first-person videos. In ICCV. 2177--2185."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Nicolas Carion Francisco Massa Gabriel Synnaeve Nicolas Usunier Alexander Kirillov and Sergey Zagoruyko. 2020. End-to-end object detection with transformers. In ECCV. 213--229.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_2_4_1","volume-title":"A short note about kinetics-600. arXiv preprint arXiv:1808.01340","author":"Carreira Joao","year":"2018","unstructured":"Joao Carreira, Eric Noland, Andras Banki-Horvath, Chloe Hillier, and Andrew Zisserman. 2018. A short note about kinetics-600. arXiv preprint arXiv:1808.01340 (2018)."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Joao Carreira and Andrew Zisserman. 2017. Quo vadis action recognition? a new model and the kinetics dataset. In CVPR. 6299--6308.","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.107388"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"crossref","unstructured":"Hazel Doughty Dima Damen and Walterio Mayol-Cuevas. 2018. Who's better? who's best? pairwise deep ranking for skill determination. In CVPR. 6057--6066.","DOI":"10.1109\/CVPR.2018.00634"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Hazel Doughty Walterio Mayol-Cuevas and Dima Damen. 2019. The pros and cons: Rank-aware temporal attention for skill determination in long videos. In CVPR. 7862--7871.","DOI":"10.1109\/CVPR.2019.00805"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Patrick Esser Ekaterina Sutter and Bj\u00f6rn Ommer. 2018. A variational u-net for conditional appearance and shape generation. In CVPR. 8857--8866.","DOI":"10.1109\/CVPR.2018.00923"},{"key":"e_1_3_2_2_10_1","volume-title":"Ramon Pena, Lela DiMonte, Anshu Gupta, Aishani Ataliwala, and Jocelyn Barker.","author":"Fathollahi Mona","year":"2022","unstructured":"Mona Fathollahi, Mohammad Hasan Sarhan, Ramon Pena, Lela DiMonte, Anshu Gupta, Aishani Ataliwala, and Jocelyn Barker. 2022. Video-Based Surgical Skills Assessment Using Long Term Tool Tracking. In MICCAI. 541--550."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Jibin Gao Wei-Shi Zheng Jia-Hui Pan Chengying Gao Yaowei Wang Wei Zeng and Jianhuang Lai. 2020. An asymmetric modeling for action assessment. In ECCV. 222--238.","DOI":"10.1007\/978-3-030-58577-8_14"},{"key":"e_1_3_2_2_12_1","volume-title":"MICCAI Workshop","volume":"3","author":"Gao Yixin","year":"2014","unstructured":"Yixin Gao, S Swaroop Vedula, Carol E Reiley, Narges Ahmidi, Balakrishnan Varadarajan, Henry C Lin, Lingling Tao, Luca Zappella, Benjamin B\u00e9jar, David D Yuh, et al. 2014. Jhu-isi gesture and skill assessment working set (jigsaws): A surgical activity dataset for human motion modeling. In MICCAI Workshop, Vol. 3."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3017727"},{"key":"e_1_3_2_2_14_1","volume-title":"NeurIPS","volume":"30","author":"Kendall Alex","year":"2017","unstructured":"Alex Kendall and Yarin Gal. 2017. What uncertainties do we need in bayesian deep learning for computer vision? NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"crossref","unstructured":"Mingzhe Li Hong-Bo Zhang Qing Lei Zongwen Fan Jinghua Liu and Ji-Xiang Du. 2022c. Pairwise Contrastive Learning Network for Action Quality Assessment. In ECCV. 457--473.","DOI":"10.1007\/978-3-031-19772-7_27"},{"key":"e_1_3_2_2_16_1","volume-title":"Mhformer: Multi-hypothesis transformer for 3d human pose estimation. In CVPR. 13147--13156.","author":"Li Wenhao","year":"2022","unstructured":"Wenhao Li, Hong Liu, Hao Tang, Pichao Wang, and Luc Van Gool. 2022b. Mhformer: Multi-hypothesis transformer for 3d human pose estimation. In CVPR. 13147--13156."},{"key":"e_1_3_2_2_17_1","volume-title":"Scoringnet: Learning key fragment for action quality assessment with ranking loss in skilled sports. In ACCV. 149--164.","author":"Li Yongjun","year":"2019","unstructured":"Yongjun Li, Xiujuan Chai, and Xilin Chen. 2019. Scoringnet: Learning key fragment for action quality assessment with ranking loss in skilled sports. In ACCV. 149--164."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"crossref","unstructured":"Zhenqiang Li Lin Gu Weimin Wang Ryosuke Nakamura and Yoichi Sato. 2022a. Surgical Skill Assessment via Video Semantic Aggregation. In MICCAI. 410--420.","DOI":"10.1007\/978-3-031-16449-1_39"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"crossref","unstructured":"Yu Liu Fangyin Wei Jing Shao Lu Sheng Junjie Yan and Xiaogang Wang. 2018. Exploring disentangled feature representation beyond face identification. In CVPR. 2080--2089.","DOI":"10.1109\/CVPR.2018.00222"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Ze Liu Jia Ning Yue Cao Yixuan Wei Zheng Zhang Stephen Lin and Han Hu. 2022. Video swin transformer. In CVPR. 3202--3211.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_2_21_1","unstructured":"Boyu Lu Jun-Cheng Chen and Rama Chellappa. 2019. Unsupervised domain-specific deblurring via disentangled representations. In CVPR. 10225--10234."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"crossref","unstructured":"Takasuke Nagai Shoichiro Takeda Masaaki Matsumura Shinya Shimizu and Susumu Yamamoto. 2021. Action quality assessment with ignoring scene context. In ICIP. 1189--1193.","DOI":"10.1109\/ICIP42928.2021.9506257"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00458"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","unstructured":"Qiang Nie Ziwei Liu and Yunhui Liu. 2020. Unsupervised 3d human pose representation with viewpoint and pose disentanglement. In ECCV. 102--118.","DOI":"10.1007\/978-3-030-58529-7_7"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Xuesong Niu Zitong Yu Hu Han Xiaobai Li Shiguang Shan and Guoying Zhao. 2020. Video-based remote physiological measurement via cross-verified feature disentangling. In ECCV. 295--310.","DOI":"10.1007\/978-3-030-58536-5_18"},{"key":"e_1_3_2_2_26_1","unstructured":"Jia-Hui Pan Jibin Gao and Wei-Shi Zheng. 2019. Action assessment by joint relation graphs. In ICCV. 6331--6340."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3126534"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Paritosh Parmar and Brendan Morris. 2019a. Action quality assessment across multiple actions. In WACV. 1468--1476.","DOI":"10.1109\/WACV.2019.00161"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Paritosh Parmar and Brendan Tran Morris. 2019b. What and how well you performed? a multitask learning approach to action quality assessment. In CVPR. 304--313.","DOI":"10.1109\/CVPR.2019.00039"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.16"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"crossref","unstructured":"Hamed Pirsiavash Carl Vondrick and Antonio Torralba. 2014. Assessing the quality of actions. In ECCV. 556--571.","DOI":"10.1007\/978-3-319-10599-4_36"},{"key":"e_1_3_2_2_32_1","volume-title":"Acm-net: Action context modeling network for weakly-supervised temporal action localization. arXiv preprint arXiv:2104.02967","author":"Qu Sanqing","year":"2021","unstructured":"Sanqing Qu, Guang Chen, Zhijun Li, Lijun Zhang, Fan Lu, and Alois Knoll. 2021. Acm-net: Action context modeling network for weakly-supervised temporal action localization. arXiv preprint arXiv:2104.02967 (2021)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"crossref","unstructured":"Yansong Tang Zanlin Ni Jiahuan Zhou Danyang Zhang Jiwen Lu Ying Wu and Jie Zhou. 2020. Uncertainty-aware score distribution learning for action quality assessment. In CVPR. 9839--9848.","DOI":"10.1109\/CVPR42600.2020.00986"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Du Tran Lubomir Bourdev Rob Fergus Lorenzo Torresani and Manohar Paluri. 2015. Learning spatiotemporal features with 3d convolutional networks. In ICCV. 4489--4497.","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Luan Tran Xi Yin and Xiaoming Liu. 2017. Disentangled representation learning gan for pose-invariant face recognition. In CVPR. 1415--1424.","DOI":"10.1109\/CVPR.2017.141"},{"key":"e_1_3_2_2_36_1","volume-title":"NeurIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2023.102770"},{"key":"e_1_3_2_2_38_1","volume-title":"Tsa-net: Tube self-attention network for action quality assessment. In ACM Multimedia. 4902--4910.","author":"Wang Shunli","year":"2021","unstructured":"Shunli Wang, Dingkang Yang, Peng Zhai, Chixiao Chen, and Lihua Zhang. 2021. Tsa-net: Tube self-attention network for action quality assessment. In ACM Multimedia. 4902--4910."},{"key":"e_1_3_2_2_39_1","volume-title":"Gregory D Hager, and Trac D Tran.","author":"Xiang Xiang","year":"2018","unstructured":"Xiang Xiang, Ye Tian, Austin Reiter, Gregory D Hager, and Trac D Tran. 2018. S3d: Stacking segmental p3d for action quality assessment. In ICIP. 928--932."},{"key":"e_1_3_2_2_40_1","unstructured":"Angchi Xu Ling-An Zeng and Wei-Shi Zheng. 2022b. Likert Scoring with Grade Decoupling for Long-term Action Assessment. In CVPR. 3232--3241."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2019.2927118"},{"key":"e_1_3_2_2_42_1","volume-title":"Finediving: A fine-grained dataset for procedure-aware action quality assessment. In CVPR. 2949--2958.","author":"Xu Jinglin","year":"2022","unstructured":"Jinglin Xu, Yongming Rao, Xumin Yu, Guangyi Chen, Jie Zhou, and Jiwen Lu. 2022a. Finediving: A fine-grained dataset for procedure-aware action quality assessment. In CVPR. 2949--2958."},{"key":"e_1_3_2_2_43_1","unstructured":"Xumin Yu Yongming Rao Wenliang Zhao Jiwen Lu and Jie Zhou. 2021. Group-aware contrastive regression for action quality assessment. In ICCV. 7919--7928."},{"key":"e_1_3_2_2_44_1","volume-title":"Ghulam Mubashar Hassan, and Ajmal Mian","author":"Zahan Sania","year":"2023","unstructured":"Sania Zahan, Ghulam Mubashar Hassan, and Ajmal Mian. 2023. Learning Sparse Temporal Video Mapping for Action Quality Assessment in Floor Gymnastics. arXiv preprint arXiv:2301.06103 (2023)."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Ling-An Zeng Fa-Ting Hong Wei-Shi Zheng Qi-Zhi Yu Wei Zeng Yao-Wei Wang and Jian-Huang Lai. 2020. Hybrid dynamic-static context-aware attention network for action assessment in long videos. In ACM Multimedia. 2526--2534.","DOI":"10.1145\/3394171.3413560"},{"key":"e_1_3_2_2_46_1","volume-title":"Mixste: Seq2seq mixed spatio-temporal encoder for 3d human pose estimation in video. In CVPR. 13232--13242.","author":"Zhang Jinlu","year":"2022","unstructured":"Jinlu Zhang, Zhigang Tu, Jianyu Yang, Yujin Chen, and Junsong Yuan. 2022b. Mixste: Seq2seq mixed spatio-temporal encoder for 3d human pose estimation in video. In CVPR. 13232--13242."},{"key":"e_1_3_2_2_47_1","volume-title":"Relative hidden markov models for video-based evaluation of motion skills in surgical training","author":"Zhang Qiang","year":"2014","unstructured":"Qiang Zhang and Baoxin Li. 2014. Relative hidden markov models for video-based evaluation of motion skills in surgical training. IEEE transactions on pattern analysis and machine intelligence, Vol. 37, 6 (2014), 1206--1218."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3143549"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"crossref","unstructured":"Ziyuan Zhang Luan Tran Xi Yin Yousef Atoum Xiaoming Liu Jian Wan and Nanxin Wang. 2019. Gait recognition via disentangled representation learning. In CVPR. 4710--4719","DOI":"10.1109\/CVPR.2019.00484"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613795","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613795","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:36:41Z","timestamp":1750178201000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613795"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":49,"alternative-id":["10.1145\/3581783.3613795","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613795","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}