{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:08Z","timestamp":1765339628001,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476015, 62171298"],"award-info":[{"award-number":["62476015, 62171298"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Joint Fund of the National Natural Science Foundation of China","award":["U21B2038"],"award-info":[{"award-number":["U21B2038"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754886","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"2919-2928","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-Modal Dual-Causal Learning for Long-Term Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1852-9002","authenticated-orcid":false,"given":"Shaowu","family":"Xu","sequence":"first","affiliation":[{"name":"College of Computer Science, Beijing University of Technology, Chaoyang Qu, Beijing Shi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8799-8042","authenticated-orcid":false,"given":"Xibin","family":"Jia","sequence":"additional","affiliation":[{"name":"College of Computer Science, Beijing University of Technology, Chaoyang Qu, Beijing Shi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8105-5497","authenticated-orcid":false,"given":"Junyu","family":"Gao","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Haidian Qu, Beijing Shi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1928-7772","authenticated-orcid":false,"given":"Qianmei","family":"Sun","sequence":"additional","affiliation":[{"name":"Beijing Chao-yang Hospital, Capital Medical University, Chaoyang Qu, Beijing Shi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2708-3921","authenticated-orcid":false,"given":"Jing","family":"Chang","sequence":"additional","affiliation":[{"name":"Beijing Chao-yang Hospital, Capital Medical University, Chaoyang Qu, Beijing Shi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1379-760X","authenticated-orcid":false,"given":"Chao","family":"Fan","sequence":"additional","affiliation":[{"name":"College of Computer Science, Beijing University of Technology, Chaoyang Qu, Beijing Shi, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"2","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML, Vol. 2. 
4.","journal-title":"ICML"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3290012"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01768"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.337"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01942"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01282"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00034"},{"key":"e_1_3_2_1_10_1","volume-title":"Videograph: Recognizing minutes-long human activities in videos. arXiv preprint arXiv:1905.05143","author":"Hussein Noureldien","year":"2019","unstructured":"Noureldien Hussein, Efstratios Gavves, and Arnold WM Smeulders. 2019b. Videograph: Recognizing minutes-long human activities in videos. arXiv preprint arXiv:1905.05143 (2019)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_6"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01798"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.105"},{"key":"e_1_3_2_1_15_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_16_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_17_1","volume-title":"European Conference on Computer Vision. Springer, 237-255","author":"Li Kunchang","year":"2024","unstructured":"Kunchang Li, Xinhao Li, Yi Wang, Yinan He, Yali Wang, Limin Wang, and Yu Qiao. 2024. Videomamba: State space model for efficient video understanding. In European Conference on Computer Vision. Springer, 237-255."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01348"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3284594"},{"key":"e_1_3_2_1_20_1","volume-title":"Deep Counterfactual Representation Learning for Visual Recognition against Weather Corruptions","author":"Liu Hong","year":"2023","unstructured":"Hong Liu, Yongqing Sun, Yukihiro Bandoh, Masaki Kitahara, and Shin'ichi Satoh. 2023d. Deep Counterfactual Representation Learning for Visual Recognition against Weather Corruptions. 
IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3284038"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3284038"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3386339"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00788"},{"key":"e_1_3_2_1_25_1","volume-title":"MSQNet: Actor-agnostic Action Recognition with Multi-modal Query. arXiv preprint arXiv:2307.10763","author":"Mondal Anindya","year":"2023","unstructured":"Anindya Mondal, Sauradip Nag, Joaquin M Prada, Xiatian Zhu, and Anjan Dutta. 2023. MSQNet: Actor-agnostic Action Recognition with Multi-modal Query. arXiv preprint arXiv:2307.10763 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1017\/S0266466603004109"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01251"},{"volume-title":"Causal inference in statistics: a primer","author":"Pearl Judea","key":"e_1_3_2_1_28_1","unstructured":"Judea Pearl. 2016. Causal inference in statistics: a primer. John Wiley & Sons."},{"key":"e_1_3_2_1_29_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_30_1","volume-title":"Tokenlearner: What can 8 learned tokens do for images and videos? arXiv preprint arXiv:2106.11297","author":"Ryoo Michael S","year":"2021","unstructured":"Michael S Ryoo, AJ Piergiovanni, Anurag Arnab, Mostafa Dehghani, and Anelia Angelova. 2021. Tokenlearner: What can 8 learned tokens do for images and videos? arXiv preprint arXiv:2106.11297 (2021)."},{"key":"e_1_3_2_1_31_1","volume-title":"Assemblenet: Searching for multi-stream neural connectivity in video architectures. arXiv preprint arXiv:1905.13209","author":"Ryoo Michael S","year":"2019","unstructured":"Michael S Ryoo, AJ Piergiovanni, Mingxing Tan, and Anelia Angelova. 2019. Assemblenet: Searching for multi-stream neural connectivity in video architectures. arXiv preprint arXiv:1905.13209 (2019)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Bernhard Sch\u00f6lkopf. 2022. Causality for machine learning. In Probabilistic and causal inference: The works of Judea Pearl. 765-804.","DOI":"10.1145\/3501714.3501755"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2021.3058954"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_25"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"e_1_3_2_1_37_1","volume-title":"Learning Invariant Causal Mechanism from Vision-Language Models. arXiv preprint arXiv:2405.15289","author":"Song Zeen","year":"2024","unstructured":"Zeen Song, Siyu Zhao, Xingyu Zhang, Jiangmeng Li, Changwen Zheng, and Wenwen Qiang. 2024. Learning Invariant Causal Mechanism from Vision-Language Models. 
arXiv preprint arXiv:2405.15289 (2024)."},{"volume-title":"prediction, and search","author":"Spirtes Peter","key":"e_1_3_2_1_38_1","unstructured":"Peter Spirtes, Clark Glymour, and Richard Scheines. 2001. Causation, prediction, and search. MIT press."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00130"},{"key":"e_1_3_2_1_40_1","volume-title":"Deconfounding causal inference for zero-shot action recognition","author":"Wang Junyan","year":"2023","unstructured":"Junyan Wang, Yiqi Jiang, Yang Long, Xiuyu Sun, Maurice Pagnucco, and Yang Song. 2023a. Deconfounding causal inference for zero-shot action recognition. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00618"},{"key":"e_1_3_2_1_42_1","volume-title":"Actionclip: A new paradigm for video action recognition. arXiv preprint arXiv:2109.08472","author":"Wang Mengmeng","year":"2021","unstructured":"Mengmeng Wang, Jiazheng Xing, and Yong Liu. 2021. Actionclip: A new paradigm for video action recognition. arXiv preprint arXiv:2109.08472 (2021)."},{"key":"e_1_3_2_1_43_1","first-page":"3933","article-title":"Weakly-supervised video object grounding via causal intervention","volume":"45","author":"Wang Wei","year":"2022","unstructured":"Wei Wang, Junyu Gao, and Changsheng Xu. 2022. Weakly-supervised video object grounding via causal intervention. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 3 (2022), 3933-3948.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01876-w"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00640"},{"key":"e_1_3_2_1_46_1","volume-title":"Attend and Tell: Neural Image Caption Generation with Visual Attention. In Proceedings of the 32nd International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"2057","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, Attend and Tell: Neural Image Caption Generation with Visual Attention. In Proceedings of the 32nd International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 37), Francis Bach and David Blei (Eds.). PMLR, Lille, France, 2048-2057. https:\/\/proceedings.mlr.press\/v37\/xuc15.html"},{"key":"e_1_3_2_1_47_1","volume-title":"Rhyrnn: Rhythmic rnn for recognizing events in long and complex videos. In Computer Vision-ECCV 2020: 16th European Conference","author":"Yu Tianshu","year":"2020","unstructured":"Tianshu Yu, Yikang Li, and Baoxin Li. 2020. Rhyrnn: Rhythmic rnn for recognizing events in long and complex videos. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part X 16. Springer, 127-144."},{"key":"e_1_3_2_1_48_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. 
arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00887"},{"key":"e_1_3_2_1_50_1","volume-title":"Twinformer: Fine-to-coarse temporal modeling for long-term action recognition","author":"Zhou Jiaming","year":"2023","unstructured":"Jiaming Zhou, Kun-Yu Lin, Yu-Kun Qiu, and Wei-Shi Zheng. 2023. Twinformer: Fine-to-coarse temporal modeling for long-term action recognition. IEEE Transactions on Multimedia (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754886","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:03:53Z","timestamp":1765339433000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754886"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":50,"alternative-id":["10.1145\/3746027.3754886","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754886","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
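The record above is the `{"status":"ok","message-type":"work",...}` envelope returned by Crossref's public REST API works endpoint for this DOI. As a minimal sketch of how such a record is retrieved and parsed, the Python below fetches the same metadata and prints a few fields; the field names (`title`, `DOI`, `container-title`, `author`, `references-count`) are taken from the record itself, while the client name and `mailto` contact in the User-Agent header are placeholders you would replace with your own (a contact address is the convention for Crossref's "polite" pool, not a requirement of the API).

```python
import json
import urllib.request

# DOI taken from the record above; the works endpoint is Crossref's
# standard public REST API: https://api.crossref.org/works/{doi}
DOI = "10.1145/3746027.3754886"
URL = f"https://api.crossref.org/works/{DOI}"

# Placeholder client name and contact address (polite-pool convention).
req = urllib.request.Request(
    URL, headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"}
)

with urllib.request.urlopen(req) as resp:
    # The payload of interest sits under the "message" key of the envelope.
    work = json.load(resp)["message"]

# "title" and "container-title" are arrays in Crossref records.
print(work["title"][0])
print(work["DOI"], "-", work["container-title"][0])

# Each author entry carries "given"/"family"; ORCID is optional per author.
for a in work["author"]:
    print(f'{a["given"]} {a["family"]}', a.get("ORCID", ""))

print("references:", work["references-count"])
```

Note that `json.load` would reject the record as deposited here if the stray line breaks inside its `unstructured` strings were left in place, which is why they are rejoined above: raw newlines are not legal inside JSON string values.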