{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:46:56Z","timestamp":1765309616903,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62236008,U21B2038,U23B2051,62471013,62406305,62476068"],"award-info":[{"award-number":["62236008,U21B2038,U23B2051,62471013,62406305,62476068"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Youth Innovation Promotion Association CAS"},{"name":"Postdoctoral Fellowship Program of CPSF","award":["GZB20230732"],"award-info":[{"award-number":["GZB20230732"]}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2023M743441"],"award-info":[{"award-number":["2023M743441"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762058","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:17Z","timestamp":1761375257000},"page":"13815-13821","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MGVC: MLLM-Guided Video Captioning for the IntentVC Challenge"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1132-859X","authenticated-orcid":false,"given":"Zhipeng","family":"Yu","sequence":"first","affiliation":[{"name":"SEECE, UCAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3512-7277","authenticated-orcid":false,"given":"Qianqian","family":"Xu","sequence":"additional","affiliation":[{"name":"IIP, ICT, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0148-8306","authenticated-orcid":false,"given":"Yangbangyan","family":"Jiang","sequence":"additional","affiliation":[{"name":"SCST, UCAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3207-5472","authenticated-orcid":false,"given":"Pinci","family":"Yang","sequence":"additional","affiliation":[{"name":"SEECE, UCAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7542-296X","authenticated-orcid":false,"given":"Qingming","family":"Huang","sequence":"additional","affiliation":[{"name":"SCST, UCAS IIP, ICT, CAS, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"key":"e_1_3_2_1_2_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_4_1","volume-title":"Alexander Kolesnikov, Xiao Wang, Daniel Salz, Maxim Neumann, Ibrahim Alabdulmohsin, Michael Tschannen, Emanuele Bugliarello, et al.","author":"Beyer Lucas","year":"2024","unstructured":"Lucas Beyer, Andreas Steiner, Andr\u00e9 Susano Pinto, Alexander Kolesnikov, Xiao Wang, Daniel Salz, Maxim Neumann, Ibrahim Alabdulmohsin, Michael Tschannen, Emanuele Bugliarello, et al., 2024. Paligemma: A versatile 3b vlm for transfer. arXiv preprint arXiv:2407.07726 (2024)."},{"volume-title":"Natural Language Processing with Python","author":"Bird Steven","key":"e_1_3_2_1_5_1","unstructured":"Steven Bird, Edward Loper, and Ewan Klein. 2009. Natural Language Processing with Python. O'Reilly Media, Sebastopol, CA."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_7_1","volume-title":"Shikra: Unleashing multimodal llm's referential dialogue magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing multimodal llm's referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198."},{"key":"e_1_3_2_1_9_1","volume-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning. Advances in neural information processing systems","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2023. Instructblip: Towards general-purpose vision-language models with instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 49250-49267."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01309"},{"key":"e_1_3_2_1_11_1","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models. ICLR, Vol. 1, 2 (2022), 3.","journal-title":"ICLR"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01273"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3762057"},{"key":"e_1_3_2_1_14_1","unstructured":"Yuwon Lee. 2024. Qwen2-VL-Finetune. https:\/\/github.com\/2U1\/Qwen2-VL-Finetune"},{"key":"e_1_3_2_1_15_1","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in neural information processing systems Vol. 33 (2020) 9459-9474."},{"key":"e_1_3_2_1_16_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_17_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_19_1","volume-title":"Describe Anything: Detailed Localized Image and Video Captioning. arXiv preprint arXiv:2504.16072","author":"Lian Long","year":"2025","unstructured":"Long Lian, Yifan Ding, Yunhao Ge, Sifei Liu, Hanzi Mao, Boyi Li, Marco Pavone, Ming-Yu Liu, Trevor Darrell, Adam Yala, and Yin Cui. 2025. Describe Anything: Detailed Localized Image and Video Captioning. arXiv preprint arXiv:2504.16072 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_21_1","volume-title":"European conference on computer vision. Springer, 38-55","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et al., 2024b. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European conference on computer vision. Springer, 38-55."},{"key":"e_1_3_2_1_22_1","unstructured":"OpenAI. 2024. Gpt-4o System Card. Technical Report. OpenAI. https:\/\/openai.com\/gpt-4o"},{"key":"e_1_3_2_1_23_1","first-page":"114321","article-title":"Artemis: Towards referential understanding in complex videos","volume":"37","author":"Qiu Jihao","year":"2024","unstructured":"Jihao Qiu, Yuan Zhang, Xi Tang, Lingxi Xie, Tianren Ma, Pengyu Yan, David Doermann, Qixiang Ye, and Yunjie Tian. 2024. Artemis: Towards referential understanding in complex videos. Advances in Neural Information Processing Systems, Vol. 37 (2024), 114321-114347.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in neural information processing systems Vol. 35 (2022) 25278-25294."},{"key":"e_1_3_2_1_25_1","volume-title":"SOVC: Subject-Oriented Video Captioning. arXiv preprint arXiv:2312.13330","author":"Teng Chang","year":"2023","unstructured":"Chang Teng, Yunchuan Ma, Guorong Li, Yuankai Qi, Laiyu Qing, and Qingming Huang. 2023. SOVC: Subject-Oriented Video Captioning. arXiv preprint arXiv:2312.13330 (2023)."},{"key":"e_1_3_2_1_26_1","unstructured":"Weihan Wang Qingsong Lv Wenmeng Yu Wenyi Hong Ji Qi Yan Wang Junhui Ji Zhuoyi Yang Lei Zhao Xixuan Song Jiazheng Xu Bin Xu Juanzi Li Yuxiao Dong Ming Ding and Jie Tang. 2023b. CogVLM: Visual Expert for Pretrained Language Models. arXiv:2311.03079 [cs.CV]"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612152"},{"key":"e_1_3_2_1_28_1","volume-title":"European Conference on Computer Vision. Springer, 207-224","author":"Wu Jialian","year":"2024","unstructured":"Jialian Wu, Jianfeng Wang, Zhengyuan Yang, Zhe Gan, Zicheng Liu, Junsong Yuan, and Lijuan Wang. 2024. Grit: A generative region-to-text transformer for object understanding. In European Conference on Computer Vision. Springer, 207-224."},{"key":"e_1_3_2_1_29_1","first-page":"54683","article-title":"Datasetdm: Synthesizing data with perception annotations using diffusion models","volume":"36","author":"Wu Weijia","year":"2023","unstructured":"Weijia Wu, Yuzhong Zhao, Hao Chen, Yuchao Gu, Rui Zhao, Yefei He, Hong Zhou, Mike Zheng Shou, and Chunhua Shen. 2023. Datasetdm: Synthesizing data with perception annotations using diffusion models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 54683-54695.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01284"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_32_1","volume-title":"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v. arXiv preprint arXiv:2310.11441","author":"Yang Jianwei","year":"2023","unstructured":"Jianwei Yang, Hao Zhang, Feng Li, Xueyan Zou, Chunyuan Li, and Jianfeng Gao. 2023. Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v. arXiv preprint arXiv:2310.11441 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"European Conference on Computer Vision. Springer, 425-443","author":"Yu En","year":"2024","unstructured":"En Yu, Liang Zhao, Yana Wei, Jinrong Yang, Dongming Wu, Lingyu Kong, Haoran Wei, Tiancai Wang, Zheng Ge, Xiangyu Zhang, et al., 2024. Merlin: Empowering multimodal llms with foresight minds. In European Conference on Computer Vision. Springer, 425-443."},{"key":"e_1_3_2_1_34_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01767"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Yuzhong Zhao Feng Liu Yue Liu Mingxiang Liao Chen Gong Qixiang Ye and Fang Wan. 2024b. DynRefer: Delving into Region-level Multi-modality Tasks via Dynamic Resolution. arXiv:2405.16071 [cs.CV]","DOI":"10.1109\/CVPR52734.2025.02304"},{"key":"e_1_3_2_1_37_1","volume-title":"ControlCap: Controllable Region-Level Captioning. In European Conference on Computer Vision. Springer, 21-38","author":"Zhao Yuzhong","year":"2024","unstructured":"Yuzhong Zhao, Yue Liu, Zonghao Guo, Weijia Wu, Chen Gong, Qixiang Ye, and Fang Wan. 2024a. ControlCap: Controllable Region-Level Captioning. In European Conference on Computer Vision. Springer, 21-38."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"e_1_3_2_1_39_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_40_1","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Hao Tian Yuchen Duan Weijie Su Jie Shao Zhangwei Gao Erfei Cui Xuehui Wang Yue Cao Yangzhou Liu Xingguang Wei Hongjie Zhang Haomin Wang Weiye Xu Hao Li Jiahao Wang Nianchen Deng Songze Li Yinan He Tan Jiang Jiapeng Luo Yi Wang Conghui He Botian Shi Xingcheng Zhang Wenqi Shao Junjun He Yingtong Xiong Wenwen Qu Peng Sun Penglong Jiao Han Lv Lijun Wu Kaipeng Zhang Huipeng Deng Jiaye Ge Kai Chen Limin Wang Min Dou Lewei Lu Xizhou Zhu Tong Lu Dahua Lin Yu Qiao Jifeng Dai and Wenhai Wang. 2025. InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv:2504.10479 [cs.CV] https:\/\/arxiv.org\/abs\/2504.10479"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762058","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:44:08Z","timestamp":1765309448000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762058"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":40,"alternative-id":["10.1145\/3746027.3762058","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762058","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}