{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T11:36:57Z","timestamp":1773229017231,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["No. 62376052"],"award-info":[{"award-number":["No. 62376052"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3728423.3759414","type":"proceedings-article","created":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T15:32:07Z","timestamp":1759937527000},"page":"47-64","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Commentary Master: Exploring Fine-grained Video Action Commentary"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6458-7599","authenticated-orcid":false,"given":"Sifan","family":"Zhang","sequence":"first","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6418-0969","authenticated-orcid":false,"given":"Bingcheng","family":"Dong","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3267-1653","authenticated-orcid":false,"given":"Yuning","family":"Ding","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0777-5499","authenticated-orcid":false,"given":"Jinrong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0530-9297","authenticated-orcid":false,"given":"Qiang","family":"Gao","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3823-4200","authenticated-orcid":false,"given":"Shenglan","family":"Liu","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8565-2545","authenticated-orcid":false,"given":"Tao","family":"Sun","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00232"},{"key":"e_1_3_2_2_3_1","unstructured":"Akari Asai Zeqiu Wu Yizhong Wang Avirup Sil and Hannaneh Hajishirzi. 2023. Self-RAG: Learning to Retrieve Generate and Critique through Self-Reflection. 
arXiv:2310.11511 [cs.CL] https:\/\/arxiv.org\/abs\/2310.11511"},{"key":"e_1_3_2_2_4_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023. Qwen Technical Report. arXiv:2309.16609 [cs.CL] https:\/\/arxiv.org\/abs\/2309.16609"},{"key":"e_1_3_2_2_5_1","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization Jade Goldstein Alon Lavie Chin-Yew Lin and Clare Voss (Eds.). Association for Computational Linguistics Ann Arbor Michigan 65-72. https:\/\/aclanthology.org\/W05-0909"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_4"},{"key":"e_1_3_2_2_7_1","unstructured":"T. Tony Cai and Rong Ma. 2022. Theoretical Foundations of t-SNE for Visualizing High-Dimensional Clustered Data. arXiv:2105.07536 [stat.ML] https:\/\/arxiv.org\/abs\/2105.07536"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1080\/07421222.2022.2127441"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Jianlv Chen Shitao Xiao Peitian Zhang Kun Luo Defu Lian and Zheng Liu. 2024b. BGE M3-Embedding: Multi-Lingual Multi-Functionality Multi-Granularity Text Embeddings Through Self-Knowledge Distillation. arXiv:2402.03216 [cs.CL] https:\/\/arxiv.org\/abs\/2402.03216","DOI":"10.18653\/v1\/2024.findings-acl.137"},{"key":"e_1_3_2_2_11_1","volume-title":"IPL: Leveraging Multimodal Large Language Models for Intelligent Product Listing. arXiv preprint arXiv:2410.16977","author":"Chen Kang","year":"2024","unstructured":"Kang Chen, Qingheng Zhang, Chengbao Lian, Yixin Ji, Xuwei Liu, Shuguang Han, Guoqiang Wu, Fei Huang, and Jufeng Chen. 2024c. IPL: Leveraging Multimodal Large Language Models for Intelligent Product Listing. arXiv preprint arXiv:2410.16977 (2024)."},{"key":"e_1_3_2_2_12_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024a. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_2_13_1","volume-title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2023. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238 (2023)."},{"key":"e_1_3_2_2_14_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 
20186-20196","author":"Ha Myoung Hoon","year":"2022","unstructured":"Hyung-gun Chi, Myoung Hoon Ha, Seunggeun Chi, Sang Wan Lee, Qixing Huang, and Karthik Ramani. 2022. Infogcn: Representation learning for human skeleton-based action recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 20186-20196."},{"key":"e_1_3_2_2_15_1","volume-title":"Temporal action segmentation: An analysis of modern techniques","author":"Ding Guodong","year":"2023","unstructured":"Guodong Ding, Fadime Sener, and Angela Yao. 2023. Temporal action segmentation: An analysis of modern techniques. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681084"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"crossref","unstructured":"Matthijs Douze Alexandr Guzhva Chengqi Deng Jeff Johnson Gergely Szilvasy Pierre-Emmanuel Mazar\u00e9 Maria Lomeli Lucas Hosseini and Herv\u00e9 J\u00e9gou. 2024. The Faiss library. (2024). arXiv:2401.08281 [cs.LG]","DOI":"10.1109\/TBDATA.2025.3618474"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72670-5_5"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.chaos.2024.114958"},{"key":"e_1_3_2_2_20_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_2_21_1","volume-title":"Vision-language models for medical report generation and visual question answering: A review. arXiv preprint arXiv:2403.02469","author":"Hartsock Iryna","year":"2024","unstructured":"Iryna Hartsock and Ghulam Rasool. 2024. Vision-language models for medical report generation and visual question answering: A review. arXiv preprint arXiv:2403.02469 (2024)."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2017.01.010"},{"key":"e_1_3_2_2_23_1","volume-title":"Laura Greenstreet, Joshua Fan, Aaron Ferber, Marta Ummus, Alecsander Brito, Olivia Graham, Lillian Aoki, et al.","author":"Hogan Brendan","year":"2024","unstructured":"Brendan Hogan, Anmol Kabra, Felipe Siqueira Pacheco, Laura Greenstreet, Joshua Fan, Aaron Ferber, Marta Ummus, Alecsander Brito, Olivia Graham, Lillian Aoki, et al., 2024. AiSciVision: A Framework for Specializing Large Multimodal Models in Scientific Image Classification. arXiv preprint arXiv:2410.21480 (2024)."},{"key":"e_1_3_2_2_24_1","unstructured":"Yutong Hu Quzhe Huang Mingxu Tao Chen Zhang and Yansong Feng. 2024. Can Perplexity Reflect Large Language Model's Ability in Long Text Understanding? arXiv:2405.06105 [cs.CL] https:\/\/arxiv.org\/abs\/2405.06105"},{"key":"e_1_3_2_2_25_1","unstructured":"Bin Huang Xin Wang Hong Chen Zihan Song and Wenwu Zhu. 2023. VTimeLLM: Empower LLM to Grasp Video Moments. arXiv:2311.18445 [cs.CV] https:\/\/arxiv.org\/abs\/2311.18445"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.396"},{"key":"e_1_3_2_2_27_1","volume-title":"FlashRAG: A Modular Toolkit for Efficient Retrieval-Augmented Generation Research. CoRR","author":"Jin Jiajie","year":"2024","unstructured":"Jiajie Jin, Yutao Zhu, Xinyu Yang, Chenghao Zhang, and Zhicheng Dou. 2024. FlashRAG: A Modular Toolkit for Efficient Retrieval-Augmented Generation Research. CoRR, Vol. abs\/2405.13576 (2024). 
arXiv:2405.13576 https:\/\/arxiv.org\/abs\/2405.13576"},{"key":"e_1_3_2_2_28_1","volume-title":"Tanzila Saba, Usman Habib, Junaid Ali Khan, and Aaqif Afzaal Abbasi.","author":"Khan Muhammad Attique","year":"2024","unstructured":"Muhammad Attique Khan, Kashif Javed, Sajid Ali Khan, Tanzila Saba, Usman Habib, Junaid Ali Khan, and Aaqif Afzaal Abbasi. 2024. Human action recognition using fusion of multiview and deep features: an application to video surveillance. Multimedia tools and applications, Vol. 83, 5 (2024), 14885-14911."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01594-9"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevPhysEducRes.19.010132"},{"key":"e_1_3_2_2_31_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2021","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. arXiv:2005.11401 [cs.CL] https:\/\/arxiv.org\/abs\/2005.11401"},{"key":"e_1_3_2_2_32_1","unstructured":"Dawei Li Bohan Jiang Liangjie Huang Alimohammad Beigi Chengshuai Zhao Zhen Tan Amrita Bhattacharjee Yuxuan Jiang Canyu Chen Tianhao Wu et al. 2024. From generation to judgment: Opportunities and challenges of llm-as-a-judge. arXiv preprint arXiv:2411.16594 (2024)."},{"key":"e_1_3_2_2_33_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arXiv:2201.12086 [cs.CV] https:\/\/arxiv.org\/abs\/2201.12086","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arXiv:2201.12086 [cs.CV] https:\/\/arxiv.org\/abs\/2201.12086"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Bin Lin Yang Ye Bin Zhu Jiaxi Cui Munan Ning Peng Jin and Li Yuan. 2024a. Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. arXiv:2311.10122 [cs.CV] https:\/\/arxiv.org\/abs\/2311.10122","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"e_1_3_2_2_35_1","volume-title":"VILA: On Pre-training for Visual Language Models. arXiv:2312.07533 [cs.CV] https:\/\/arxiv.org\/abs\/2312.07533","author":"Lin Ji","year":"2024","unstructured":"Ji Lin, Hongxu Yin, Wei Ping, Yao Lu, Pavlo Molchanov, Andrew Tao, Huizi Mao, Jan Kautz, Mohammad Shoeybi, and Song Han. 2024b. VILA: On Pre-training for Visual Language Models. arXiv:2312.07533 [cs.CV] https:\/\/arxiv.org\/abs\/2312.07533"},{"key":"e_1_3_2_2_36_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning. arXiv:2304.08485 [cs.CV] https:\/\/arxiv.org\/abs\/2304.08485"},{"key":"e_1_3_2_2_37_1","volume-title":"Multidimensional Refinement Graph Convolutional Network With Robust Decouple Loss for Fine-Grained Skeleton-Based Action Recognition","author":"Liu Sheng-Lan","year":"2024","unstructured":"Sheng-Lan Liu, Yu-Ning Ding, Jin-Rong Zhang, Kai-Yuan Liu, Si-Fan Zhang, Fei-Long Wang, and Gao Huang. 2024. Multidimensional Refinement Graph Convolutional Network With Robust Decouple Loss for Fine-Grained Skeleton-Based Action Recognition. 
IEEE Transactions on Neural Networks and Learning Systems (2024)."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.7717\/peerj-cs.1400"},{"key":"e_1_3_2_2_39_1","unstructured":"Muhammad Maaz Hanoona Rasheed Salman Khan and Fahad Shahbaz Khan. 2024. Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv:2306.05424 [cs.CV] https:\/\/arxiv.org\/abs\/2306.05424"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00536"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.3390\/s23042182"},{"key":"e_1_3_2_2_42_1","volume-title":"Summarizing Instructional Videos with Task Relevance and Cross-Modal Saliency. In European Conference on Computer Vision. Springer, 540-557","author":"Narasimhan Medhini","year":"2022","unstructured":"Medhini Narasimhan, Arsha Nagrani, Chen Sun, Michael Rubinstein, Trevor Darrell, Anna Rohrbach, and Cordelia Schmid. 2022. TL; DW? Summarizing Instructional Videos with Task Relevance and Cross-Modal Saliency. In European Conference on Computer Vision. Springer, 540-557."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00643"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei jing Zhu. 2002. BLEU: a Method for Automatic Evaluation of Machine Translation. 311-318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_2_45_1","first-page":"1468","article-title":"Action quality assessment across multiple actions. In 2019 IEEE winter conference on applications of computer vision (WACV)","author":"Parmar Paritosh","year":"2019","unstructured":"Paritosh Parmar and Brendan Morris. 2019a. Action quality assessment across multiple actions. In 2019 IEEE winter conference on applications of computer vision (WACV). IEEE, 1468-1476.","journal-title":"IEEE"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00039"},{"key":"e_1_3_2_2_47_1","volume-title":"Momentor: Advancing Video Large Language Model with Fine-Grained Temporal Reasoning. arXiv:2402.11435 [cs.CV] https:\/\/arxiv.org\/abs\/2402.11435","author":"Qian Long","year":"2024","unstructured":"Long Qian, Juncheng Li, Yu Wu, Yaobo Ye, Hao Fei, Tat-Seng Chua, Yueting Zhuang, and Siliang Tang. 2024. Momentor: Advancing Video Large Language Model with Fine-Grained Temporal Reasoning. arXiv:2402.11435 [cs.CV] https:\/\/arxiv.org\/abs\/2402.11435"},{"key":"e_1_3_2_2_48_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00986"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10044-023-01166-8"},{"key":"e_1_3_2_2_51_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. 
Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, Vol. 35 (2022), 10078-10093."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"crossref","unstructured":"Ramakrishna Vedantam C. Lawrence Zitnick and Devi Parikh. 2015. CIDEr: Consensus-based Image Description Evaluation. arXiv:1411.5726 [cs.CV] https:\/\/arxiv.org\/abs\/1411.5726","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475438"},{"key":"e_1_3_2_2_54_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2019.2927118"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"crossref","unstructured":"Hu Xu Gargi Ghosh Po-Yao Huang Dmytro Okhonko Armen Aghajanyan Florian Metze Luke Zettlemoyer and Christoph Feichtenhofer. 2021. VideoCLIP: Contrastive Pre-training for Zero-shot Video-Text Understanding. arXiv:2109.14084 [cs.CV] https:\/\/arxiv.org\/abs\/2109.14084","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103707"},{"key":"e_1_3_2_2_58_1","volume-title":"WorldGPT: a Sora-inspired video AI agent as Rich world models from text and image inputs. arXiv preprint arXiv:2403.07944","author":"Yang Deshun","year":"2024","unstructured":"Deshun Yang, Luhui Hu, Yu Tian, Zihao Li, Chris Kelly, Bang Yang, Cindy Yang, and Yuexian Zou. 2024. WorldGPT: a Sora-inspired video AI agent as Rich world models from text and image inputs. arXiv preprint arXiv:2403.07944 (2024)."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00782"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.1078"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-023-09068-w"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"crossref","unstructured":"Hang Zhang Xin Li and Lidong Bing. 2023a. Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. arXiv:2306.02858 [cs.CL] https:\/\/arxiv.org\/abs\/2306.02858","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_2_63_1","volume-title":"End-to-End Streaming Video Temporal Action Segmentation with Reinforce Learning. arXiv preprint arXiv:2309.15683","author":"Zhang Jinrong","year":"2023","unstructured":"Jinrong Zhang, Wujun Wen, Shenglan Liu, Yunheng Li, Qifeng Li, and Lin Feng. 2023b. End-to-End Streaming Video Temporal Action Segmentation with Reinforce Learning. 
arXiv preprint arXiv:2309.15683 (2023)."},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01744"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 8th International ACM Workshop on Multimedia Content Analysis in Sports"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3728423.3759414","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T14:31:39Z","timestamp":1773153099000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3728423.3759414"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":64,"alternative-id":["10.1145\/3728423.3759414","10.1145\/3728423"],"URL":"https:\/\/doi.org\/10.1145\/3728423.3759414","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
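
For readers who want to work with this record programmatically, here is a minimal sketch of fetching the same work record from the public Crossref REST API and reading a few fields. The api.crossref.org endpoint is an assumption external to the record itself; all field names used below ("message", "title", "author", "container-title", "page") are taken directly from the record above, and error handling plus Crossref's recommended polite-pool User-Agent header are omitted for brevity.

import json
import urllib.request

# Minimal sketch: fetch this paper's Crossref work record by DOI.
# Assumes network access to api.crossref.org (endpoint pattern: /works/{DOI}).
DOI = "10.1145/3728423.3759414"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    # Same envelope shape as the record above:
    # {"status": "ok", "message-type": "work", ..., "message": {...}}
    record = json.load(resp)

work = record["message"]
title = work["title"][0]          # "Commentary Master: Exploring Fine-grained Video Action Commentary"
authors = [f'{a["given"]} {a["family"]}' for a in work["author"]]
venue = work["container-title"][0]

print(title)
print(", ".join(authors))         # Sifan Zhang, Bingcheng Dong, ...
print(venue, "pp.", work["page"]) # ...Multimedia Content Analysis in Sports pp. 47-64

Note that "title" and "container-title" are arrays in Crossref's schema (hence the [0]), and the "reference" array under "message" carries the 64 entries listed above, keyed by "key" with either a "DOI" or an "unstructured" string per entry.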