{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:27:00Z","timestamp":1765308420482,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62472178, 62376244"],"award-info":[{"award-number":["62472178, 62376244"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Urban Digital Transformation Special Fund Project","award":["202301027"],"award-info":[{"award-number":["202301027"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755077","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"3418-3427","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["TimeSoccer: An End-to-End Multimodal Large Language Model for Soccer Commentary Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2422-5106","authenticated-orcid":false,"given":"Ling","family":"You","sequence":"first","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9656-813X","authenticated-orcid":false,"given":"Wenxuan","family":"Huang","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5815-1062","authenticated-orcid":false,"given":"Xinni","family":"Xie","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7630-5558","authenticated-orcid":false,"given":"Xiangyi","family":"Wei","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2746-5150","authenticated-orcid":false,"given":"Bangyan","family":"Li","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0284-9940","authenticated-orcid":false,"given":"Shaohui","family":"Lin","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9427-7665","authenticated-orcid":false,"given":"Yang","family":"Li","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8940-6418","authenticated-orcid":false,"given":"Changbo","family":"Wang","sequence":"additional","affiliation":[{"name":"East China Normal University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.3390\/vehicles6030074"},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning Vol. 35. 23716-23736."},{"key":"e_1_3_2_1_3_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. 65-72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. 65-72."},{"key":"e_1_3_2_1_5_1","volume-title":"Token Merging: Your ViT But Faster. In ICLR.","author":"Bolya Daniel","year":"2023","unstructured":"Daniel Bolya, Cheng-Yang Fu, Xiaoliang Dai, Peizhao Zhang, Christoph Feichtenhofer, and Judy Hoffman. 2023. Token Merging: Your ViT But Faster. In ICLR."},{"key":"e_1_3_2_1_6_1","first-page":"640","volume-title":"Tel Aviv","author":"Cheng Ho Kei","year":"2022","unstructured":"Ho Kei Cheng and Alexander G Schwing. 2022. XMem: Long-Term Video Object Segmentation with an Atkinson-Shiffrin Memory Model. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXVIII. Springer, 640-658."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00448"},{"key":"e_1_3_2_1_8_1","volume-title":"Soccernet-tracking: Multiple object tracking dataset and benchmark in soccer videos. 3491-3502.","author":"Cioppa Anthony","year":"2022","unstructured":"Anthony Cioppa, Silvio Giancola, Adrien Deliege, Le Kang, Xin Zhou, Zhiyu Cheng, Bernard Ghanem, and Marc Van Droogenbroeck. 2022. Soccernet-tracking: Multiple object tracking dataset and benchmark in soccer videos. 3491-3502."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Anthony Cioppa Silvio Giancola Vladimir Somers Victor Joos Floriane Magera Jan Held Seyed Abolfazl Ghasemzadeh Xin Zhou Karolina Seweryn Mateusz Kowalczyk Zuzanna Mr\u00f3z Hassan Mkhallati Adrien Deli\u00e8ge Carlos Hinojosa Karen Sanchez Amir M. Mansourian Pierre Miralles Olivier Barnich Christophe De Vleeschouwer Alexandre Alahi Bernard Ghanem Marc Van Droogenbroeck Adam Gorski Albert Clap\u00e9s Andrei Boiarov Anton Afanasiev Artur Xarles Atom Scott ByoungKwon Lim Calvin Yeung Cristian Gonzalez Dominic R\u00fcfenacht Enzo Pacilio Fabian Deuser Faisal Sami Altawijri Francisco Cach\u00f3n HanKyul Kim Haobo Wang Hyeonmin Choe Hyunwoo J Kim Il-Min Kim Jae-Mo Kang Jamshid Tursunboev Jian Yang Jihwan Hong Jimin Lee Jing Zhang Junseok Lee Kexin Zhang Konrad Habel Licheng Jiao Linyi Li Marc Guti\u00e9rrez-P\u00e9rez Marcelo Ortega Menglong Li Milosz Lopatto Nikita Kasatkin Nikolay Nemtsev Norbert Oswald Oleg Udin Pavel Kononov Pei Geng Saad Ghazai Alotaibi Sehyung Kim Sergei Ulasen Sergio Escalera Shanshan Zhang Shuyuan Yang Sunghwan Moon Thomas B. Moeslund Vasyl Shandyba Vladimir Golovkin Wei Dai WonTaek Chung Xinyu Liu Yongqiang Zhu Youngseo Kim Yuan Li Yuting Yang Yuxuan Xiao Zehua Cheng and Zhihao Li. 2024. SoccerNet 2024 Challenges Results. arXiv preprint arXiv:2409.10587 (2024).","DOI":"10.1007\/s12283-024-00466-4"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00508"},{"key":"e_1_3_2_1_11_1","volume-title":"European Conference on Computer Vision. Springer, 75-92","author":"Fan Yue","year":"2024","unstructured":"Yue Fan, Xiaojian Ma, Rujie Wu, Yuntao Du, Jiaqi Li, Zhi Gao, and Qing Li. 2024. Videoagent: A memory-augmented multimodal agent for video understanding. In European Conference on Computer Vision. Springer, 75-92."},{"key":"e_1_3_2_1_12_1","volume-title":"SODA: Story Oriented Dense Video Captioning Evaluation Framework. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:226291804","author":"Fujita Soichiro","year":"2020","unstructured":"Soichiro Fujita, Tsutomu Hirao, Hidetaka Kamigaito, Manabu Okumura, and Masaaki Nagata. 2020. SODA: Story Oriented Dense Video Captioning Evaluation Framework. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:226291804"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2023.3240502"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00223"},{"key":"e_1_3_2_1_15_1","volume-title":"Vision-r1: Incentivizing reasoning capability in multimodal large language models. arXiv preprint arXiv:2503.06749","author":"Huang Wenxuan","year":"2025","unstructured":"Wenxuan Huang, Bohan Jia, Zijie Zhai, Shaosheng Cao, Zheyu Ye, Fei Zhao, Yao Hu, and Shaohui Lin. 2025a. Vision-r1: Incentivizing reasoning capability in multimodal large language models. arXiv preprint arXiv:2503.06749 (2025)."},{"key":"e_1_3_2_1_16_1","volume-title":"Dynamic-llava: Efficient multimodal large language models via dynamic vision-language context sparsification. ICLR","author":"Huang Wenxuan","year":"2025","unstructured":"Wenxuan Huang, Zijie Zhai, Yunhang Shen, Shaosheng Cao, Fei Zhao, Xiangfeng Xu, Zheyu Ye, and Shaohui Lin. 2025b. Dynamic-llava: Efficient multimodal large language models via dynamic vision-language context sparsification. ICLR (2025)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i4.32427"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_20_1","unstructured":"Bangyan Li Wenxuan Huang Yunhang Shen Yeqiang Wang Shaohui Lin Jingzhong Lin Ling You Yinqi Zhang Ke Li Xing Sun et al. 2025. LLaVA-RadZ: Can Multimodal Large Language Models Effectively Tackle Zero-shot Radiology Recognition? arXiv preprint arXiv:2503.07487 (2025)."},{"key":"e_1_3_2_1_21_1","volume-title":"Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, et al., 2024c. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_22_1","first-page":"28541","article-title":"Llava-med: Training a large language-and-vision assistant for biomedicine in one day","volume":"36","author":"Li Chunyuan","year":"2023","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2023c. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2023), 28541-28564.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681593"},{"key":"e_1_3_2_1_24_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. 19730-19742."},{"key":"e_1_3_2_1_25_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_26_1","volume-title":"VideoChat: Chat-Centric Video Understanding. CoRR","author":"Li Kunchang","year":"2023","unstructured":"Kunchang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023a. VideoChat: Chat-Centric Video Understanding. CoRR (2023)."},{"key":"e_1_3_2_1_27_1","first-page":"22195","article-title":"MVBench","author":"Li Kunchang","year":"2024","unstructured":"Kunchang Li, Yali Wang, Yinan He, Yizhuo Li, Yi Wang, Yi Liu, Zun Wang, Jilan Xu, Guo Chen, Ping Luo, Limin Wang, and Yu Qiao. 2024a. MVBench: A Comprehensive Multi-modal Video Understanding Benchmark. 22195-22206.","journal-title":"A Comprehensive Multi-modal Video Understanding Benchmark."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00536"},{"key":"e_1_3_2_1_30_1","volume-title":"GOAL: A challenging knowledge-grounded video captioning benchmark for real-time soccer commentary generation. 5391-5395.","author":"Qi Ji","year":"2023","unstructured":"Ji Qi, Jifan Yu, Teng Tu, Kunyu Gao, Yifan Xu, Xinyu Guan, Xiaozhi Wang, Bin Xu, Lei Hou, Juanzi Li, et al., 2023. GOAL: A challenging knowledge-grounded video captioning benchmark for real-time soccer commentary generation. 5391-5395."},{"key":"e_1_3_2_1_31_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision."},{"key":"e_1_3_2_1_32_1","volume-title":"Towards universal Soccer video understanding. arXiv preprint arXiv:2412.01820","author":"Rao Jiayuan","year":"2024","unstructured":"Jiayuan Rao, Haoning Wu, Hao Jiang, Ya Zhang, Yanfeng Wang, and Weidi Xie. 2024a. Towards universal Soccer video understanding. arXiv preprint arXiv:2412.01820 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Matchtime: Towards automatic soccer game commentary generation. arXiv preprint arXiv:2406.18530","author":"Rao Jiayuan","year":"2024","unstructured":"Jiayuan Rao, Haoning Wu, Chang Liu, Yanfeng Wang, and Weidi Xie. 2024b. Matchtime: Towards automatic soccer game commentary generation. arXiv preprint arXiv:2406.18530 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 5179-5187","author":"Song Yale","year":"2015","unstructured":"Yale Song, Jordi Vallmitjana, Amanda Stent, and Alejandro Jaimes. 2015. Tvsum: Summarizing web videos using titles. In Proceedings of the IEEE conference on computer vision and pattern recognition. 5179-5187."},{"key":"e_1_3_2_1_37_1","volume-title":"Xinlong Wang, and Yue Cao.","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Yu Wu, Xinlong Wang, and Yue Cao. 2023. EVA-CLIP: Improved Training Techniques for CLIP at Scale. ArXiv, Vol. abs\/2303.15389 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257766387"},{"key":"e_1_3_2_1_38_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023a. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_39_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023b. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Cider: Consensus-based image description evaluation. 4566-4575.","author":"Vedantam Ramakrishna","year":"2015","unstructured":"Ramakrishna Vedantam, C Lawrence Zitnick, and Devi Parikh. 2015. Cider: Consensus-based image description evaluation. 4566-4575."},{"key":"e_1_3_2_1_41_1","volume-title":"TimeZero: Temporal Video Grounding with Reasoning-Guided LVLM. arXiv preprint arXiv:2503.13377","author":"Wang Ye","year":"2025","unstructured":"Ye Wang, Boshen Xu, Zihao Yue, Zihan Xiao, Ziheng Wang, Liang Zhang, Dingyi Yang, Wenxuan Wang, and Qin Jin. 2025. TimeZero: Temporal Video Grounding with Reasoning-Guided LVLM. arXiv preprint arXiv:2503.13377 (2025)."},{"key":"e_1_3_2_1_42_1","volume-title":"European Conference on Computer Vision. Springer, 453-470","author":"Weng Yuetian","year":"2024","unstructured":"Yuetian Weng, Mingfei Han, Haoyu He, Xiaojun Chang, and Bohan Zhuang. 2024. Longvlm: Efficient long video understanding via large language models. In European Conference on Computer Vision. Springer, 453-470."},{"key":"e_1_3_2_1_43_1","volume-title":"Antoine Miech, Jordi Pont-Tuset, Ivan Laptev, Josef Sivic, and Cordelia Schmid.","author":"Yang Antoine","year":"2023","unstructured":"Antoine Yang, Arsha Nagrani, Paul Hongsuck Seo, Antoine Miech, Jordi Pont-Tuset, Ivan Laptev, Josef Sivic, and Cordelia Schmid. 2023. Vid2seq: Large-scale pretraining of a visual language model for dense video captioning. 10714-10726."},{"key":"e_1_3_2_1_44_1","volume-title":"Deco: Decoupling token compression from semantic abstraction in multimodal large language models. arXiv preprint arXiv:2405.20985","author":"Yao Linli","year":"2024","unstructured":"Linli Yao, Lei Li, Shuhuai Ren, Lean Wang, Yuanxin Liu, Xu Sun, and Lu Hou. 2024. Deco: Decoupling token compression from semantic abstraction in multimodal large language models. arXiv preprint arXiv:2405.20985 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Timesuite: Improving mllms for long video understanding via grounded tuning. arXiv preprint arXiv:2410.19702","author":"Zeng Xiangyu","year":"2024","unstructured":"Xiangyu Zeng, Kunchang Li, Chenting Wang, Xinhao Li, Tianxiang Jiang, Ziang Yan, Songze Li, Yansong Shi, Zhengrong Yue, Yi Wang, et al., 2024. Timesuite: Improving mllms for long video understanding via grounded tuning. arXiv preprint arXiv:2410.19702 (2024)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Xiaohua Zhai Basil Mustafa Alexander Kolesnikov and Lucas Beyer. 2023. Sigmoid loss for language image pre-training. 11975-11986.","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Hang Zhang Xin Li and Lidong Bing. 2023. Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. In EMNLP (Demos).","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_1_48_1","volume-title":"Video instruction tuning with synthetic data. arXiv preprint arXiv:2410.02713","author":"Zhang Yuanhan","year":"2024","unstructured":"Yuanhan Zhang, Jinming Wu, Wei Li, Bo Li, Zejun Ma, Ziwei Liu, and Chunyuan Li. 2024. Video instruction tuning with synthetic data. arXiv preprint arXiv:2410.02713 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"Xuehan Xiong, Arsha Nagrani, and Cordelia Schmid.","author":"Zhou Xingyi","year":"2024","unstructured":"Xingyi Zhou, Anurag Arnab, Shyamal Buch, Shen Yan, Austin Myers, Xuehan Xiong, Arsha Nagrani, and Cordelia Schmid. 2024. Streaming Dense Video Captioning."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755077","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:23:46Z","timestamp":1765308226000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755077"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":49,"alternative-id":["10.1145\/3746027.3755077","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755077","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}