{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T08:23:43Z","timestamp":1768379023580,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681664","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"1848-1856","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Vi2ACT:Video-enhanced Cross-modal Co-learning with Representation Conditional Discriminator for Few-shot Human Activity Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5453-0643","authenticated-orcid":false,"given":"Kang","family":"Xia","sequence":"first","affiliation":[{"name":"Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9199-3655","authenticated-orcid":false,"given":"Wenzhong","family":"Li","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5956-6159","authenticated-orcid":false,"given":"Yimiao","family":"Shao","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1467-4519","authenticated-orcid":false,"given":"Sanglu","family":"Lu","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450 (2016)."},{"key":"e_1_3_2_1_2_1","volume-title":"An empirical evaluation of generic convolutional and recurrent networks for sequence modeling. arXiv preprint arXiv:1803.01271","author":"Bai Shaojie","year":"2018","unstructured":"Shaojie Bai, J Zico Kolter, and Vladlen Koltun. 2018. An empirical evaluation of generic convolutional and recurrent networks for sequence modeling. arXiv preprint arXiv:1803.01271 (2018)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341162.3344854"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-021-02283-3"},{"key":"e_1_3_2_1_5_1","article-title":"Deep Compressed Sensing based Data Imputation for Urban Environmental Monitoring","volume":"20","author":"Chang Qingyi","year":"2023","unstructured":"Qingyi Chang, Dan Tao, Jiangtao Wang, and Ruipeng Gao. 2023. Deep Compressed Sensing based Data Imputation for Urban Environmental Monitoring. ACM Trans. Sen. Netw., Vol. 20, 1, Article 17 (nov 2023), 21 pages.","journal-title":"ACM Trans. Sen. Netw."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2927224"},{"key":"e_1_3_2_1_7_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_8_1","volume-title":"Yeng Chai Soh, and Le Zhang","author":"Chen Zhenghua","year":"2017","unstructured":"Zhenghua Chen, Qingchang Zhu, Yeng Chai Soh, and Le Zhang. 2017. Robust human activity recognition using smartphone sensors via CT-PCA and online SVM. IEEE transactions on industrial informatics, Vol. 13, 6 (2017), 3070--3080."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.3390\/s22051911"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2017.12.012"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413591"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550299"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550294"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3517246"},{"key":"e_1_3_2_1_15_1","volume-title":"HARGPT: Are LLMs Zero-Shot Human Activity Recognizers? arXiv preprint arXiv:2403.02727","author":"Ji Sijie","year":"2024","unstructured":"Sijie Ji, Xinzhe Zheng, and Chenshu Wu. 2024. HARGPT: Are LLMs Zero-Shot Human Activity Recognizers? arXiv preprint arXiv:2403.02727 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448093"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.3390\/s22103932"},{"key":"e_1_3_2_1_18_1","volume-title":"IMUTube: Automatic Extraction of Virtual on-body Accelerometry from Video for Human Activity Recognition. arxiv","author":"Kwon Hyeokhyen","year":"2006","unstructured":"Hyeokhyen Kwon, Catherine Tong, Harish Haresamudram, Yan Gao, Gregory D. Abowd, Nicholas D. Lane, and Thomas Ploetz. 2020. IMUTube: Automatic Extraction of Virtual on-body Accelerometry from Video for Human Activity Recognition. arxiv: 2006.05675 [cs.CV]"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478096"},{"key":"e_1_3_2_1_20_1","volume-title":"Paolo Di Achille, and Shwetak Patel","author":"Liu Xin","year":"2023","unstructured":"Xin Liu, Daniel McDuff, Geza Kovacs, Isaac Galatzer-Levy, Jacob Sunshine, Jiening Zhan, Ming-Zher Poh, Shun Liao, Paolo Di Achille, and Shwetak Patel. 2023. Large language models are few-shot health learners. arXiv preprint arXiv:2305.15525 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386901.3389030"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307334.3326109"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2018.2793913"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448074"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3494998"},{"key":"e_1_3_2_1_26_1","volume-title":"Conditional Generative Adversarial Nets. CoRR","author":"Mirza Mehdi","year":"2014","unstructured":"Mehdi Mirza and Simon Osindero. 2014. Conditional Generative Adversarial Nets. CoRR, Vol. abs\/1411.1784 (2014). showeprint[arXiv]1411.1784 http:\/\/arxiv.org\/abs\/1411.1784"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132028"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3463498","article-title":"Enabling real-time sign language translation on mobile platforms with on-board depth cameras","volume":"5","author":"Park HyeonJung","year":"2021","unstructured":"HyeonJung Park, Youngki Lee, and JeongGil Ko. 2021. Enabling real-time sign language translation on mobile platforms with on-board depth cameras. Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies, Vol. 5, 2 (2021), 1--30.","journal-title":"Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies"},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-26561-2_6"},{"key":"e_1_3_2_1_31_1","volume-title":"Human activity recognition with smartphone sensors using deep learning neural networks. Expert systems with applications","author":"Ronao Charissa Ann","year":"2016","unstructured":"Charissa Ann Ronao and Sung-Bae Cho. 2016. Human activity recognition with smartphone sensors using deep learning neural networks. Expert systems with applications, Vol. 59 (2016), 235--244."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3328932"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-61030-6_23"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3070646"},{"key":"e_1_3_2_1_35_1","volume-title":"Alexey Kurakin, and Chun-Liang Li.","author":"Sohn Kihyuk","year":"2020","unstructured":"Kihyuk Sohn, David Berthelot, Nicholas Carlini, Zizhao Zhang, Han Zhang, Colin A Raffel, Ekin Dogus Cubuk, Alexey Kurakin, and Chun-Liang Li. 2020. Fixmatch: Simplifying semi-supervised learning with consistency and confidence. Advances in neural information processing systems, Vol. 33 (2020), 596--608."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448112"},{"key":"e_1_3_2_1_37_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_38_1","volume-title":"Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571","author":"Wang Jiuniu","year":"2023","unstructured":"Jiuniu Wang, Hangjie Yuan, Dayou Chen, Yingya Zhang, Xiang Wang, and Shiwei Zhang. 2023. Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580859"},{"key":"e_1_3_2_1_40_1","volume-title":"Proc. ACM Interact. Mob. Wearable Ubiquitous Technol.","volume":"7","author":"Xia Kang","year":"2024","unstructured":"Kang Xia, Wenzhong Li, Shiwei Gan, and Sanglu Lu. 2024. TS2ACT: Few-Shot Human Activity Sensing with Cross-Modal Co-Learning. Proc. ACM Interact. Mob. Wearable Ubiquitous Technol., Vol. 7, 4, Article 188 (jan 2024), 22 pages."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411836"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534572"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.4108\/icst.mobicase.2014.257786"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681664","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681664","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:50Z","timestamp":1750295870000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681664"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":43,"alternative-id":["10.1145\/3664647.3681664","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681664","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}