{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T16:45:59Z","timestamp":1779381959889,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376069"],"award-info":[{"award-number":["62376069"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Young Elite Scientists Sponsorship Program by CAST","award":["2023QNRC001"],"award-info":[{"award-number":["2023QNRC001"]}]},{"name":"Guangdong Basic and Applied Basic Research Foundation","award":["2024A1515012027"],"award-info":[{"award-number":["2024A1515012027"]}]},{"name":"Shenzhen Science and Technology Program","award":["KQTD20240729102207002"],"award-info":[{"award-number":["KQTD20240729102207002"]}]},{"name":"Shenzhen Science and Technology Program","award":["ZDSYS20230626091203008"],"award-info":[{"award-number":["ZDSYS20230626091203008"]}]},{"name":"Jiangsu Science and Technology Major Program","award":["BG2024041"],"award-info":[{"award-number":["BG2024041"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758218","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"12776-12783","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["FineBadminton: A Multi-Level Dataset for Fine-Grained Badminton Video Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8074-8538","authenticated-orcid":false,"given":"Xusheng","family":"He","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3685-5747","authenticated-orcid":false,"given":"Wei","family":"Liu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0569-700X","authenticated-orcid":false,"given":"Shanshan","family":"Ma","sequence":"additional","affiliation":[{"name":"China Electronics Standardization Institute, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4334-2290","authenticated-orcid":false,"given":"Qian","family":"Liu","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1429-863X","authenticated-orcid":false,"given":"Chenghao","family":"Ma","sequence":"additional","affiliation":[{"name":"China Electronics Standardization Institute, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0247-5221","authenticated-orcid":false,"given":"Jianlong","family":"Wu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0614"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3595916.3626370"},{"key":"e_1_3_2_1_4_1","volume-title":"RA-BLIP: Multimodal Adaptive Retrieval-Augmented Bootstrapping Language-Image Pre-training","author":"Ding Muhe","year":"2025","unstructured":"Muhe Ding, Yang Ma, Pengda Qin, Jianlong Wu, Yuhong Li, and Liqiang Nie. 2025. RA-BLIP: Multimodal Adaptive Retrieval-Augmented Bootstrapping Language-Image Pre-training. IEEE Transactions on Multimedia (2025)."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. IEEE, 961-970","author":"Fabian Caba Heilbron Bernard Ghanem","year":"2015","unstructured":"Bernard Ghanem Fabian Caba Heilbron, Victor Escorcia and Juan Carlos Niebles. 2015. ActivityNet: A Large-Scale Video Benchmark for Human Activity Understanding. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. IEEE, 961-970."},{"key":"e_1_3_2_1_6_1","volume-title":"Leandro Von Werra, and Thomas Wolf","author":"Farr\u00e9 Miquel","year":"2024","unstructured":"Miquel Farr\u00e9, Andi Marafioti, Lewis Tunstall, Leandro Von Werra, and Thomas Wolf. 2024. FineVideo. https:\/\/huggingface.co\/datasets\/HuggingFaceFV\/finevideo."},{"key":"e_1_3_2_1_7_1","volume-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Yuhan Dai, Yongdong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, et al., 2024. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_1_10_1","volume-title":"Analyzing and Boosting the Power of Fine-Grained Visual Recognition for Multi-modal Large Language Models. arXiv preprint arXiv:2501.15140","author":"He Hulingxiao","year":"2025","unstructured":"Hulingxiao He, Geng Li, Zijun Geng, Jinglin Xu, and Yuxin Peng. 2025. Analyzing and Boosting the Power of Fine-Grained Visual Recognition for Multi-modal Large Language Models. arXiv preprint arXiv:2501.15140 (2025)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i8.28692"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00332"},{"key":"e_1_3_2_1_13_1","volume-title":"Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline Professional Videos. arXiv preprint arXiv:2501.13826","author":"Hu Kairui","year":"2025","unstructured":"Kairui Hu, Penghao Wu, Fanyi Pu, Wang Xiao, Yuanhan Zhang, Xiang Yue, Bo Li, and Ziwei Liu. 2025. Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline Professional Videos. arXiv preprint arXiv:2501.13826 (2025)."},{"key":"e_1_3_2_1_14_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_1_15_1","first-page":"11846","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","volume":"34","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Tamara L Berg, and Mohit Bansal. 2021. Detecting moments and highlights in videos via natural language queries. In Proceedings of the Advances in Neural Information Processing Systems, Vol. 34. NeurIPS Foundation, 11846-11858."},{"key":"e_1_3_2_1_16_1","volume-title":"Tvqa: Localized, compositional video question answering. arXiv preprint arXiv:1809.01696","author":"Lei Jie","year":"2018","unstructured":"Jie Lei, Licheng Yu, Mohit Bansal, and Tamara L Berg. 2018. Tvqa: Localized, compositional video question answering. arXiv preprint arXiv:1809.01696 (2018)."},{"key":"e_1_3_2_1_17_1","volume-title":"Sports-qa: A large-scale video question answering benchmark for complex and professional sports. arXiv preprint arXiv:2401.01505","author":"Li Haopeng","year":"2024","unstructured":"Haopeng Li, Andong Deng, Qiuhong Ke, Jun Liu, Hossein Rahmani, Yulan Guo, Bernt Schiele, and Chen Chen. 2024b. Sports-qa: A large-scale video question answering benchmark for complex and professional sports. arXiv preprint arXiv:2401.01505 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData62323.2024.10825009"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612129"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612131"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01328"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3672758.3672824"},{"key":"e_1_3_2_1_24_1","volume-title":"Towards Analyzing Fast, Frequent, and Fine-grained Events from Videos. arXiv preprint arXiv:2504.08222","author":"Liu Zhaoyu","year":"2025","unstructured":"Zhaoyu Liu, Kan Jiang, Murong Ma, Zhe Hou, Yun Lin, and Jin Song Dong. 2025. F^3Set: Towards Analyzing Fast, Frequent, and Fine-grained Events from Videos. arXiv preprint arXiv:2504.08222 (2025)."},{"key":"e_1_3_2_1_25_1","first-page":"46212","article-title":"Egoschema: A diagnostic benchmark for very long-form video language understanding","volume":"36","author":"Mangalam Karttikeya","year":"2023","unstructured":"Karttikeya Mangalam, Raiymbek Akshulakov, and Jitendra Malik. 2023. Egoschema: A diagnostic benchmark for very long-form video language understanding. In Proceedings of the Advances in Neural Information Processing Systems, Vol. 36. 46212-46244.","journal-title":"Proceedings of the Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_26_1","unstructured":"OpenAI. 2025. GPT-4.1. https:\/\/openai.com\/index\/gpt-4-1\/"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00785"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.99"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00130"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00794"},{"key":"e_1_3_2_1_32_1","unstructured":"ByteDance Seed Team. 2025. Doubao-1.5-pro. https:\/\/seed.bytedance.com\/en\/special\/doubao_1_5_pro"},{"key":"e_1_3_2_1_33_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_34_1","first-page":"10078","article-title":"VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training. In Proceedings of the Advances in Neural Information Processing Systems. 10078-10093.","journal-title":"Proceedings of the Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of International Joint Conference on Artificial Intelligence. 8829-8832","author":"Wang Wei-Yao","year":"2024","unstructured":"Wei-Yao Wang, Wei-Wei Du, Wen-Chih Peng, and Tsi-Ui Ik. 2024. Benchmarking Stroke Forecasting with Stroke-Level Badminton Dataset. In Proceedings of International Joint Conference on Artificial Intelligence. 8829-8832."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599906"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.501"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.283"},{"key":"e_1_3_2_1_40_1","volume-title":"Video DataFlywheel: Resolving the Impossible Data Trinity in Video-Language Understanding","author":"Wang Xiao","year":"2025","unstructured":"Xiao Wang, Jianlong Wu, Zijia Lin, Fuzheng Zhang, Di Zhang, and Liqiang Nie. 2025c. Video DataFlywheel: Resolving the Impossible Data Trinity in Video-Language Understanding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025), 2912-2923."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01754"},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Xia Haotian","year":"2025","unstructured":"Haotian Xia, Zhengbang Yang, Junbo Zou, Rhys Tracy, Yuqing Wang, Chi Lu, Christopher Lai, Yanjun He, Xun Shao, Zhuoqing Xie, Yuan fang Wang, Weining Shen, and Hanjie Chen. 2025. SPORTU: A Comprehensive Sports Understanding Benchmark for Multimodal Large Language Models. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02057"},{"key":"e_1_3_2_1_45_1","first-page":"57240","article-title":"Vript: A video is worth thousands of words","volume":"37","author":"Yang Dongjie","year":"2024","unstructured":"Dongjie Yang, Suyuan Huang, Chengqiang Lu, Xiaodong Han, Haoxin Zhang, Yan Gao, Yao Hu, and Hai Zhao. 2024. Vript: A video is worth thousands of words. In Proceedings of the Advances in Neural Information Processing Systems, Vol. 37. 57240-57261.","journal-title":"Proceedings of the Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","unstructured":"Boqiang Zhang Kehan Li Zesen Cheng Zhiqiang Hu Yuqian Yuan Guanzheng Chen Sicong Leng Yuming Jiang Hang Zhang Xin Li et al. 2025. VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding. arXiv preprint arXiv:2501.13106 (2025)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00238"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758218","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:02:35Z","timestamp":1765342955000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758218"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":47,"alternative-id":["10.1145\/3746027.3758218","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758218","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}