{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T22:46:33Z","timestamp":1780613193452,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Research Grants Council of Hong Kong","award":["STG1\/E-403\/24-N"],"award-info":[{"award-number":["STG1\/E-403\/24-N"]}]},{"name":"Research Grants Council of Hong Kong","award":["SRFS 2526-4S02"],"award-info":[{"award-number":["SRFS 2526-4S02"]}]},{"name":"Research Grants Council of Hong Kong","award":["GRF 14201425"],"award-info":[{"award-number":["GRF 14201425"]}]},{"name":"Research Grants Council of Hong Kong","award":["CRF C4032-25E"],"award-info":[{"award-number":["CRF C4032-25E"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809209","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"352-370","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["A Large-Scale Multimodal Dataset and Benchmarks for Human Activity Scene Understanding and Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9926-6532","authenticated-orcid":false,"given":"Siyang","family":"Jiang","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2624-8755","authenticated-orcid":false,"given":"Mu","family":"Yuan","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6458-6528","authenticated-orcid":false,"given":"Xiang","family":"Ji","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0032-2539","authenticated-orcid":false,"given":"Bufang","family":"Yang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0071-2416","authenticated-orcid":false,"given":"Zeyu","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5203-7496","authenticated-orcid":false,"given":"Lilin","family":"Xu","sequence":"additional","affiliation":[{"name":"Columbia University, New York, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0744-4996","authenticated-orcid":false,"given":"Yang","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1954-952X","authenticated-orcid":false,"given":"Yuting","family":"He","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8336-6144","authenticated-orcid":false,"given":"Liran","family":"Dong","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6328-714X","authenticated-orcid":false,"given":"Wenrui","family":"Lu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4433-5211","authenticated-orcid":false,"given":"Zhenyu","family":"Yan","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6480-0299","authenticated-orcid":false,"given":"Xiaofan","family":"Jiang","sequence":"additional","affiliation":[{"name":"Columbia University, New York, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2144-6960","authenticated-orcid":false,"given":"Wei","family":"Gao","sequence":"additional","affiliation":[{"name":"University of Pittsburgh, Pittsburgh, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7206-6584","authenticated-orcid":false,"given":"Hongkai","family":"Chen","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1772-7751","authenticated-orcid":false,"given":"Guoliang","family":"Xing","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"mri: Multi-modal 3d human pose estimation dataset using mmwave, rgb-d, and inertial sensors. Advances in neural information processing systems 35","author":"An Sizhe","year":"2022","unstructured":"Sizhe An, Yin Li, and Umit Ogras. 2022. mri: Multi-modal 3d human pose estimation dataset using mmwave, rgb-d, and inertial sensors. Advances in neural information processing systems 35 (2022), 27414\u201327426."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.3390\/su15054529"},{"key":"e_1_3_2_1_3_1","volume-title":"arXiv preprint arXiv:2502.13923","author":"Bai Shuai","year":"2025","unstructured":"Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, Humen Zhong, Yuanzhi Zhu, Mingkun Yang, Zhaohai Li, Jianqiang Wan, Pengfei Wang, Wei Ding, Zheren Fu, Yiheng Xu, Jiabo Ye, Xi Zhang, Tianbao Xie, Zesen Cheng, Hang Zhang, Zhibo Yang, Haiyang Xu, and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1080\/24721735.2022.2139899","article-title":"Wellness tourism service preferences and their linkages to motivational factors: a multiple case study","volume":"6","author":"Bo\u010dkus Daumantas","year":"2023","unstructured":"Daumantas Bo\u010dkus, Timo Tammi, Elli Vento, and Raija Komppula. 2023. Wellness tourism service preferences and their linkages to motivational factors: a multiple case study. International Journal of Spa and Wellness 6, 1 (2023), 78\u2013108.","journal-title":"International Journal of Spa and Wellness"},{"key":"e_1_3_2_1_5_1","unstructured":"Bureau of Labor Statistics. [n.d.]. American Time Use Survey. U.S. Department of Labor. https:\/\/fraser.stlouisfed.org\/files\/docs\/releases\/atus\/atus_20250626.pdf"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2017.06.110"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2015.7350781"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3569478"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330690"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185\u201324198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al. 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185\u201324198."},{"key":"e_1_3_2_1_12_1","volume-title":"PKU-MMD: A Large Scale Benchmark for Continuous Multi-Modal Human Action Understanding. arXiv preprint arXiv:1703.07475","author":"Chunhui Liu","year":"2017","unstructured":"Liu Chunhui, Hu Yueyu, Li Yanghao, Song Sijie, and Liu Jiaying. 2017. PKU-MMD: A Large Scale Benchmark for Continuous Multi-Modal Human Action Understanding. arXiv preprint arXiv:1703.07475 (2017)."},{"key":"e_1_3_2_1_13_1","unstructured":"MMPose Contributors. 2020. OpenMMLab Pose Estimation Toolbox and Benchmark. https:\/\/github.com\/open-mmlab\/mmpose."},{"key":"e_1_3_2_1_14_1","volume-title":"Antonino Furnari, Evangelos Kazakos, Jian Ma, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et al.","author":"Damen Dima","year":"2022","unstructured":"Dima Damen, Hazel Doughty, Giovanni Maria Farinella, Antonino Furnari, Evangelos Kazakos, Jian Ma, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et al. 2022. Rescaling egocentric vision: Collection, pipeline and challenges for epic-kitchens-100. International Journal of Computer Vision (2022), 1\u201323."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3715014.3722045"},{"key":"e_1_3_2_1_17_1","volume-title":"2023 IEEE International Conference on Pervasive Computing and Communications (PerCom). IEEE, 160\u2013170","author":"Duan Di","year":"2023","unstructured":"Di Duan, Huanqi Yang, Guohao Lan, Tianxing Li, Xiaohua Jia, and Weitao Xu. 2023. Emgsense: A low-effort self-supervised domain adaptation framework for emg sensing. In 2023 IEEE International Conference on Pervasive Computing and Communications (PerCom). IEEE, 160\u2013170."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","first-page":"e12945","DOI":"10.1111\/csp2.12945","article-title":"Practicing mindfulness in addressing the biodiversity crisis","volume":"5","author":"Gerber Leah R","year":"2023","unstructured":"Leah R Gerber, Zachary Reeves-Blurton, Nika Gueci, Gwenllian D Iacona, JA Beaudette, and Teri Pipe. 2023. Practicing mindfulness in addressing the biodiversity crisis. Conservation Science and Practice 5, 7 (2023), e12945.","journal-title":"Conservation Science and Practice"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01834"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.123"},{"key":"e_1_3_2_1_22_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPSN61024.2024.00007"},{"key":"e_1_3_2_1_24_1","unstructured":"Siyang Jiang Bufang Yang Lilin Xu Mu Yuan Yeerzhati Abudunuer Kaiwei Liu Liekang Zeng Hongkai Chen Zhenyu Yan Xiaofan Jiang et al. 2025. An LLM-Empowered Low-Resolution Vision System for On-Device Human Behavior Understanding. arXiv preprint arXiv:2505.01743 (2025)."},{"key":"e_1_3_2_1_25_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Alina Kuznetsova Hassan Rom Neil Alldrin Jasper Uijlings Ivan Krasin Jordi Pont-Tuset Shahab Kamali Stefan Popov Matteo Malloci Alexander Kolesnikov et al. 2020. The open images dataset v4: Unified image classification object detection and visual relationship detection at scale. International journal of computer vision 128 7 (2020) 1956\u20131981.","DOI":"10.1007\/s11263-020-01316-z"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). 5715\u20135724","author":"Lee Shih-Po","year":"2023","unstructured":"Shih-Po Lee, Niraj Prakash Kini, Wen-Hsiao Peng, Ching-Wen Ma, and Jenq-Neng Hwang. 2023. HuPR: A Benchmark for Human Pose Estimation Using Millimeter Wave Radar. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV). 5715\u20135724."},{"key":"e_1_3_2_1_28_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, YiWang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_30_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Ntu rgb+ d 120: A large-scale benchmark for 3d human activity understanding","author":"Liu Jun","year":"2019","unstructured":"Jun Liu, Amir Shahroudy, Mauricio Perez, Gang Wang, Ling-Yu Duan, and Alex C Kot. 2019. Ntu rgb+ d 120: A large-scale benchmark for 3d human activity understanding. IEEE transactions on pattern analysis and machine intelligence 42, 10 (2019), 2684\u20132701."},{"key":"e_1_3_2_1_32_1","volume-title":"Earda: Towards accurate and data-efficient earable activity sensing. In 2024 IEEE Coupling of Sensing & Computing in AIoT Systems (CSCAIoT)","author":"Lyu Shengzhe","year":"2024","unstructured":"Shengzhe Lyu, Yongliang Chen, Di Duan, Renqi Jia, and Weitao Xu. 2024. Earda: Towards accurate and data-efficient earable activity sensing. In 2024 IEEE Coupling of Sensing & Computing in AIoT Systems (CSCAIoT). IEEE, 1\u20137."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1093\/arclin\/acw049"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the AAAI conference on artificial intelligence","volume":"38","author":"Mondal Debjyoti","year":"2024","unstructured":"Debjyoti Mondal, Suraj Modi, Subhadarshi Panda, Rituraj Singh, and Godawari Sudhakar Rao. 2024. Kam-cot: Knowledge augmented multimodal chain-of-thoughts reasoning. In Proceedings of the AAAI conference on artificial intelligence, Vol. 38. 18798\u201318806."},{"key":"e_1_3_2_1_35_1","volume-title":"American Time Use Survey","author":"U.S. Department of Labor. 2013.","unstructured":"U.S. Department of Labor. 2013. American Time Use Survey. http:\/\/www.bls.gov\/tus\/ Accessed: 2025-04-23."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649370"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 28th Annual International Conference on Mobile Computing And Networking. 324\u2013337","author":"Ouyang Xiaomin","year":"2022","unstructured":"Xiaomin Ouyang, Xian Shuai, Jiayu Zhou, Ivy Wang Shi, Zhiyuan Xie, Guoliang Xing, and Jianwei Huang. 2022. Cosmo: contrastive fusion learning with small data for multimodal human activity recognition. In Proceedings of the 28th Annual International Conference on Mobile Computing And Networking. 324\u2013337."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 652\u2013660","author":"Qi Charles R","year":"2017","unstructured":"Charles R Qi, Hao Su, Kaichun Mo, and Leonidas J Guibas. 2017. Pointnet: Deep learning on point sets for 3d classification and segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition. 652\u2013660."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Wenhao Qi Xiaohong Zhu Bin Wang Yankai Shi Chaoqun Dong Shiying Shen Jiaqi Li Kun Zhang Yunfan He Mengjiao Zhao et al. 2025. Alzheimer's disease digital biomarkers multidimensional landscape and AI model scoping review. npj Digital Medicine 8 1 (2025) 366.","DOI":"10.1038\/s41746-025-01640-z"},{"key":"e_1_3_2_1_40_1","volume-title":"DailySTR: A Daily Human Activity Pattern Recognition Dataset for Spatio-temporal Reasoning. In 2024 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS). IEEE, 357\u2013363","author":"Qiu Yue","year":"2024","unstructured":"Yue Qiu, Shusaku Egami, Ken Fukuda, Natsuki Miyata, Takuma Yagi, Kensho Hara, Kenji Iwata, and Ryusuke Sagawa. 2024. DailySTR: A Daily Human Activity Pattern Recognition Dataset for Spatio-temporal Reasoning. In 2024 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS). IEEE, 357\u2013363."},{"key":"e_1_3_2_1_41_1","volume-title":"Functional assessment in older people. Bmj 343","author":"Quinn TJ","year":"2011","unstructured":"TJ Quinn, K McArthur, G Ellis, and DJ Stott. 2011. Functional assessment in older people. Bmj 343 (2011)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2015.07.085"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.3390\/s25134028"},{"key":"e_1_3_2_1_45_1","volume-title":"Enhancing Graph Of Thought: Enhancing Prompts with LLM Rationales and Dynamic Temperature Control. In The Thirteenth International Conference on Learning Representations.","author":"Shin SungUk","year":"2025","unstructured":"SungUk Shin and Youngjoon Kim. 2025. Enhancing Graph Of Thought: Enhancing Prompts with LLM Rationales and Dynamic Temperature Control. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.3390\/s140610146"},{"key":"e_1_3_2_1_47_1","volume-title":"Developing the American time use survey activity classification system. Monthly Labor Review","author":"Statistics Force","year":"2005","unstructured":"Force Statistics, Shelley Kristina, and Kristina J Shelley. 2005. Developing the American time use survey activity classification system. Monthly Labor Review (2005), 3."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/2809695.2809718"},{"key":"e_1_3_2_1_49_1","volume-title":"Digital biomarkers for precision diagnosis and monitoring in Parkinson's disease. NPJ digital medicine 7, 1","year":"2024","unstructured":"Yue-meng Sun, Zhi-yun Wang, Yuan-yuan Liang, Chen-wei Hao, and Chang-he Shi. 2024. Digital biomarkers for precision diagnosis and monitoring in Parkinson's disease. NPJ digital medicine 7, 1 (2024), 218."},{"key":"e_1_3_2_1_50_1","volume-title":"Omni-scale cnns: a simple and effective kernel size configuration for time series classification. arXiv preprint arXiv:2002.10061","author":"Tang Wensi","year":"2020","unstructured":"Wensi Tang, Guodong Long, Lu Liu, Tianyi Zhou, Michael Blumenstein, and Jing Jiang. 2020. Omni-scale cnns: a simple and effective kernel size configuration for time series classification. arXiv preprint arXiv:2002.10061 (2020)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Zitian Tang Wenjie Ye Wei-Chiu Ma and Hang Zhao. 2023. What Happened 3 Seconds Ago? Inferring the Past with Thermal Imaging. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01641"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 17111\u201317120","author":"Tang Zitian","year":"2023","unstructured":"Zitian Tang, Wenjie Ye, Wei-Chiu Ma, and Hang Zhao. 2023. What happened 3 seconds ago? inferring the past with thermal imaging. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 17111\u201317120."},{"key":"e_1_3_2_1_53_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3699751"},{"key":"e_1_3_2_1_55_1","volume-title":"Tarsier: Recipes for training and evaluating large video description models. arXiv preprint arXiv:2407.00634","author":"Wang Jiawei","year":"2024","unstructured":"Jiawei Wang, Liping Yuan, Yuchen Zhang, and Haomiao Sun. 2024. Tarsier: Recipes for training and evaluating large video description models. arXiv preprint arXiv:2407.00634 (2024)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3625687.3625782"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3712286"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3699765"},{"key":"e_1_3_2_1_60_1","volume-title":"iRadar: Synthesizing Millimeter-Waves from Wearable Inertial Inputs for Human Gesture Sensing. arXiv preprint arXiv:2412.15980","author":"Yang Huanqi","year":"2024","unstructured":"Huanqi Yang, Mingda Han, Xinyue Li, Di Duan, Tianxing Li, and Weitao Xu. 2024. iRadar: Synthesizing Millimeter-Waves from Wearable Inertial Inputs for Human Gesture Sensing. arXiv preprint arXiv:2412.15980 (2024)."},{"key":"e_1_3_2_1_61_1","first-page":"18756","article-title":"Mm-fi: Multi-modal non-intrusive 4d human dataset for versatile wireless sensing","volume":"36","author":"Yang Jianfei","year":"2023","unstructured":"Jianfei Yang, He Huang, Yunjiao Zhou, Xinyan Chen, Yuecong Xu, Shenghai Yuan, Han Zou, Chris Xiaoxuan Lu, and Lihua Xie. 2023. Mm-fi: Multi-modal non-intrusive 4d human dataset for versatile wireless sensing. Advances in Neural Information Processing Systems 36 (2023), 18756\u201318768.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_62_1","volume-title":"Tree of thoughts: Deliberate problem solving with large language models. Advances in neural information processing systems 36","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. 2023. Tree of thoughts: Deliberate problem solving with large language models. Advances in neural information processing systems 36 (2023), 11809\u201311822. 14"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","first-page":"9127","DOI":"10.1609\/aaai.v33i01.33019127","article-title":"Activitynet-qa: A dataset for understanding complex web videos via question answering","volume":"33","author":"Yu Zhou","year":"2019","unstructured":"Zhou Yu, Dejing Xu, Jun Yu, Ting Yu, Zhou Zhao, Yueting Zhuang, and Dacheng Tao. 2019. Activitynet-qa: A dataset for understanding complex web videos via question answering. In AAAI, Vol. 33. 9127\u20139134.","journal-title":"AAAI"},{"key":"e_1_3_2_1_64_1","volume-title":"Tarsier2: Advancing Large Vision-Language Models from Detailed Video Description to Comprehensive Video Understanding. arXiv preprint arXiv:2501.07888","author":"Yuan Liping","year":"2025","unstructured":"Liping Yuan, Jiawei Wang, Haomiao Sun, Yuchen Zhang, and Yuan Lin. 2025. Tarsier2: Advancing Large Vision-Language Models from Detailed Video Description to Comprehensive Video Understanding. arXiv preprint arXiv:2501.07888 (2025)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/2370216.2370438"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"crossref","first-page":"15","DOI":"10.1186\/s13640-025-00677-0","article-title":"A comprehensive survey on RGB-D-based human action recognition: algorithms, datasets, and popular applications","volume":"2025","author":"Zhang Yumin","year":"2025","unstructured":"Yumin Zhang and Yanyong Wang. 2025. A comprehensive survey on RGB-D-based human action recognition: algorithms, datasets, and popular applications. EURASIP Journal on Image and Video Processing 2025, 1 (2025), 15.","journal-title":"EURASIP Journal on Image and Video Processing"},{"key":"e_1_3_2_1_67_1","volume-title":"Motionbert: A unified perspective on learning human motion representations. In CVPR. 15085\u201315099.","author":"Zhu Wentao","year":"2023","unstructured":"Wentao Zhu, Xiaoxuan Ma, Zhaoyang Liu, Libin Liu, Wayne Wu, and Yizhou Wang. 2023. Motionbert: A unified perspective on learning human motion representations. In CVPR. 15085\u201315099."}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:55:29Z","timestamp":1780059329000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809209"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":67,"alternative-id":["10.1145\/3745756.3809209","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809209","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}