{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:51:40Z","timestamp":1765309900925,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"name":"Beijing Natural Science Foundation","award":["L243006"],"award-info":[{"award-number":["L243006"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476150, 62277033 and 62407027"],"award-info":[{"award-number":["62476150, 62277033 and 62407027"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755221","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"10965-10974","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LongWriter-V: Enabling Ultra-Long and High-Fidelity Generation in Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0640-3413","authenticated-orcid":false,"given":"Shangqing","family":"Tu","sequence":"first","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8940-5133","authenticated-orcid":false,"given":"Yucheng","family":"Wang","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3681-1896","authenticated-orcid":false,"given":"Daniel","family":"Zhang-Li","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7611-1093","authenticated-orcid":false,"given":"Yushi","family":"Bai","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3430-4048","authenticated-orcid":false,"given":"Jifan","family":"Yu","sequence":"additional","affiliation":[{"name":"IoE, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0817-7508","authenticated-orcid":false,"given":"Yuhao","family":"Wu","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8907-3526","authenticated-orcid":false,"given":"Lei","family":"Hou","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5754-2623","authenticated-orcid":false,"given":"Huiqin","family":"Liu","sequence":"additional","affiliation":[{"name":"IoE, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7709-2543","authenticated-orcid":false,"given":"Zhiyuan","family":"Liu","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3040-4391","authenticated-orcid":false,"given":"Bin","family":"Xu","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6244-0664","authenticated-orcid":false,"given":"Juanzi","family":"Li","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736."},{"volume-title":"Anthropic: Introducing the next generation of Claude. https:\/\/www.anthropic.com\/news\/claude-3-family","year":"2024","key":"e_1_3_2_1_2_1","unstructured":"Anthropic. 2024. Anthropic: Introducing the next generation of Claude. https:\/\/www.anthropic.com\/news\/claude-3-family"},{"key":"e_1_3_2_1_3_1","volume-title":"Longwriter: Unleashing 10,000 word generation from long context llms. arXiv preprint arXiv:2408.07055","author":"Bai Yushi","year":"2024","unstructured":"Yushi Bai, Jiajie Zhang, Xin Lv, Linzhi Zheng, Siqi Zhu, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li. 2024. Longwriter: Unleashing 10,000 word generation from long context llms. arXiv preprint arXiv:2408.07055 (2024)."},{"key":"e_1_3_2_1_4_1","volume-title":"European Conference on Computer Vision. Springer, 370-387","author":"Chen Lin","year":"2024","unstructured":"Lin Chen, Jinsong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2024a. Sharegpt4v: Improving large multi-modal models with better captions. In European Conference on Computer Vision. Springer, 370-387."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Lin Chen Xilin Wei Jinsong Li Xiaoyi Dong Pan Zhang Yuhang Zang Zehui Chen Haodong Duan Bin Lin Zhenyu Tang et al. 2024b. Sharegpt4video: Improving video understanding and generation with better captions. arXiv preprint arXiv:2406.04325 (2024).","DOI":"10.52202\/079017-0614"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024c. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198."},{"key":"e_1_3_2_1_7_1","volume-title":"VisionArena: 230K Real World User-VLM Conversations with Preference Labels. arXiv preprint arXiv:2412.08687","author":"Chou Christopher","year":"2024","unstructured":"Christopher Chou, Lisa Dunlap, Koki Mashita, Krishna Mandal, Trevor Darrell, Ion Stoica, Joseph E Gonzalez, and Wei-Lin Chiang. 2024. VisionArena: 230K Real World User-VLM Conversations with Preference Labels. arXiv preprint arXiv:2412.08687 (2024)."},{"key":"e_1_3_2_1_8_1","unstructured":"Chao Deng Jiale Yuan Pi Bu Peijie Wang Zhong-Zhi Li Jian Xu Xiao-Hui Li Yuan Gao Jun Song Bo Zheng et al. 2024. LongDocURL: a Comprehensive Multimodal Long Document Benchmark Integrating Understanding Reasoning and Locating. arXiv preprint arXiv:2412.18424 (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01356"},{"key":"e_1_3_2_1_10_1","volume-title":"V2PE: Improving Multimodal Long-Context Capability of Vision-Language Models with Variable Visual Position Encoding. arXiv preprint arXiv:2412.09616","author":"Ge Junqi","year":"2024","unstructured":"Junqi Ge, Ziyi Chen, Jintao Lin, Jinguo Zhu, Xihui Liu, Jifeng Dai, and Xizhou Zhu. 2024. V2PE: Improving Multimodal Long-Context Capability of Vision-Language Models with Variable Visual Position Encoding. arXiv preprint arXiv:2412.09616 (2024)."},{"key":"e_1_3_2_1_11_1","unstructured":"Team GLM Aohan Zeng Bin Xu Bowen Wang Chenhui Zhang Da Yin Diego Rojas Guanyu Feng Hanlin Zhao Hanyu Lai Hao Yu Hongning Wang Jiadai Sun Jiajie Zhang Jiale Cheng Jiayi Gui Jie Tang Jing Zhang Juanzi Li Lei Zhao Lindong Wu Lucen Zhong Mingdao Liu Minlie Huang Peng Zhang Qinkai Zheng Rui Lu Shuaiqi Duan Shudan Zhang Shulin Cao Shuxun Yang Weng Lam Tam Wenyi Zhao Xiao Liu Xiao Xia Xiaohan Zhang Xiaotao Gu Xin Lv Xinghan Liu Xinyi Liu Xinyue Yang Xixuan Song Xunkai Zhang Yifan An Yifan Xu Yilin Niu Yuantao Yang Yueyan Li Yushi Bai Yuxiao Dong Zehan Qi Zhaoyu Wang Zhen Yang Zhengxiao Du Zhenyu Hou and Zihan Wang. 2024. ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools. arXiv preprint arXiv:2406.12793 (2024)."},{"key":"e_1_3_2_1_12_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.3389\/frai.2024.1430984"},{"key":"e_1_3_2_1_14_1","unstructured":"Wenyi Hong Wenmeng Yu Xiaotao Gu Guo Wang Guobing Gan Haomiao Tang Jiale Cheng Ji Qi Junhui Ji Lihang Pan et al. 2025. GLM-4.1 V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning. arXiv preprint arXiv:2507.01006 (2025)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00553"},{"key":"e_1_3_2_1_16_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al.","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al., 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Llms-as-judges: a comprehensive survey on llm-based evaluation methods. arXiv preprint arXiv:2412.05579","author":"Li Haitao","year":"2024","unstructured":"Haitao Li, Qian Dong, Junjie Chen, Huixue Su, Yujia Zhou, Qingyao Ai, Ziyi Ye, and Yiqun Liu. 2024a. Llms-as-judges: a comprehensive survey on llm-based evaluation methods. arXiv preprint arXiv:2412.05579 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"GIRAFFE: Design Choices for Extending the Context Length of Visual Language Models. arXiv preprint arXiv:2412.12735","author":"Li Mukai","year":"2024","unstructured":"Mukai Li, Lei Li, Shansan Gong, and Qi Liu. 2024b. GIRAFFE: Design Choices for Extending the Context Length of Visual Language Models. arXiv preprint arXiv:2412.12735 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Self-alignment with instruction backtranslation. arXiv preprint arXiv:2308.06259","author":"Li Xian","year":"2023","unstructured":"Xian Li, Ping Yu, Chunting Zhou, Timo Schick, Omer Levy, Luke Zettlemoyer, Jason Weston, and Mike Lewis. 2023. Self-alignment with instruction backtranslation. arXiv preprint arXiv:2308.06259 (2023)."},{"key":"e_1_3_2_1_20_1","first-page":"740","volume-title":"Zurich","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, 740-755."},{"key":"e_1_3_2_1_21_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"European Conference on Computer Vision. Springer, 38-55","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et al., 2024c. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European Conference on Computer Vision. Springer, 38-55."},{"key":"e_1_3_2_1_23_1","volume-title":"Mia-dpo: Multi-image augmented direct preference optimization for large vision-language models. arXiv preprint arXiv:2410.17637","author":"Liu Ziyu","year":"2024","unstructured":"Ziyu Liu, Yuhang Zang, Xiaoyi Dong, Pan Zhang, Yuhang Cao, Haodong Duan, Conghui He, Yuanjun Xiong, Dahua Lin, and Jiaqi Wang. 2024b. Mia-dpo: Multi-image augmented direct preference optimization for large vision-language models. arXiv preprint arXiv:2410.17637 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Mmevol: Empowering multimodal large language models with evol-instruct. arXiv preprint arXiv:2409.05840","author":"Luo Run","year":"2024","unstructured":"Run Luo, Haonan Zhang, Longze Chen, Ting-En Lin, Xiong Liu, Yuchuan Wu, Min Yang, Minzheng Wang, Pengpeng Zeng, Lianli Gao, et al., 2024. Mmevol: Empowering multimodal large language models with evol-instruct. arXiv preprint arXiv:2409.05840 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Mmlongbench-doc: Benchmarking long-context document understanding with visualizations. arXiv preprint arXiv:2407.01523","author":"Ma Yubo","year":"2024","unstructured":"Yubo Ma, Yuhang Zang, Liangyu Chen, Meiqi Chen, Yizhu Jiao, Xinze Li, Xinyuan Lu, Ziyu Liu, Yan Ma, Xiaoyi Dong, et al., 2024. Mmlongbench-doc: Benchmarking long-context document understanding with visualizations. arXiv preprint arXiv:2407.01523 (2024)."},{"key":"e_1_3_2_1_26_1","unstructured":"OpenAI. 2024. OpenAI: Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/"},{"key":"e_1_3_2_1_27_1","volume-title":"Suri: Multi-constraint instruction following for long-form text generation. arXiv preprint arXiv:2406.19371","author":"Pham Chau Minh","year":"2024","unstructured":"Chau Minh Pham, Simeng Sun, and Mohit Iyyer. 2024. Suri: Multi-constraint instruction following for long-form text generation. arXiv preprint arXiv:2406.19371 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Rafailov Rafael","year":"2024","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. 2024. Direct preference optimization: Your language model is secretly a reward model. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_29_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al., 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 25278-25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","volume-title":"Video-xl: Extra-long vision language model for hour-scale video understanding. arXiv preprint arXiv:2409.14485","author":"Shu Yan","year":"2024","unstructured":"Yan Shu, Peitian Zhang, Zheng Liu, Minghao Qin, Junjie Zhou, Tiejun Huang, and Bo Zhao. 2024. Video-xl: Extra-long vision language model for hour-scale video understanding. arXiv preprint arXiv:2409.14485 (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1177\/00754240022004875"},{"key":"e_1_3_2_1_32_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al., 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Qwen Team. 2025. Qwen2.5-VL. https:\/\/qwenlm.github.io\/blog\/qwen2.5-vl\/"},{"key":"e_1_3_2_1_34_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_35_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926","author":"Wang Peiyi","year":"2023","unstructured":"Peiyi Wang, Lei Li, Liang Chen, Zefan Cai, Dawei Zhu, Binghuai Lin, Yunbo Cao, Qi Liu, Tianyu Liu, and Zhifang Sui. 2023. Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Weaver: Foundation models for creative writing. arXiv preprint arXiv:2401.17268","author":"Wang Tiannan","year":"2024","unstructured":"Tiannan Wang, Jiamin Chen, Qingrui Jia, Shuai Wang, Ruoyu Fang, Huilin Wang, Zhaowei Gao, Chunzhao Xie, Chuou Xu, Jihong Dai, et al., 2024b. Weaver: Foundation models for creative writing. arXiv preprint arXiv:2401.17268 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"Lvbench: An extreme long video understanding benchmark. arXiv preprint arXiv:2406.08035","author":"Wang Weihan","year":"2024","unstructured":"Weihan Wang, Zehai He, Wenyi Hong, Yean Cheng, Xiaohan Zhang, Ji Qi, Xiaotao Gu, Shiyu Huang, Bin Xu, Yuxiao Dong, et al., 2024c. Lvbench: An extreme long video understanding benchmark. arXiv preprint arXiv:2406.08035 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Longvideobench: A benchmark for long-context interleaved video-language understanding. arXiv preprint arXiv:2407.15754","author":"Wu Haoning","year":"2024","unstructured":"Haoning Wu, Dongxu Li, Bei Chen, and Junnan Li. 2024b. Longvideobench: A benchmark for long-context interleaved video-language understanding. arXiv preprint arXiv:2407.15754 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"LongViTU: Instruction Tuning for Long-Form Video Understanding. arXiv preprint arXiv:2501.05037","author":"Wu Rujie","year":"2025","unstructured":"Rujie Wu, Xiaojian Ma, Hai Ci, Yue Fan, Yuxuan Wang, Haozhe Zhao, Qing Li, and Yizhou Wang. 2025b. LongViTU: Instruction Tuning for Long-Form Video Understanding. arXiv preprint arXiv:2501.05037 (2025)."},{"key":"e_1_3_2_1_41_1","volume-title":"Roy Ka-Wei Lee, and Juanzi Li","author":"Wu Yuhao","year":"2025","unstructured":"Yuhao Wu, Yushi Bai, Zhiqiang Hu, Roy Ka-Wei Lee, and Juanzi Li. 2025a. LongWriter-Zero: Mastering Ultra-Long Text Generation via Reinforcement Learning. arXiv preprint arXiv:2506.18841 (2025)."},{"key":"e_1_3_2_1_42_1","volume-title":"Zhiqing Hu, and Roy Ka-Wei Lee.","author":"Wu Yuhao","year":"2024","unstructured":"Yuhao Wu, Ming Shan Hee, Zhiqing Hu, and Roy Ka-Wei Lee. 2024a. LongGenBench: Benchmarking Long-Form Generation in Long Context LLMs. arXiv preprint arXiv:2409.02076 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Longvila: Scaling long-context visual language models for long videos. arXiv preprint arXiv:2408.10188","author":"Xue Fuzhao","year":"2024","unstructured":"Fuzhao Xue, Yukang Chen, Dacheng Li, Qinghao Hu, Ligeng Zhu, Xiuyu Li, Yunhao Fang, Haotian Tang, Shang Yang, Zhijian Liu, et al., 2024. Longvila: Scaling long-context visual language models for long videos. arXiv preprint arXiv:2408.10188 (2024)."},{"key":"e_1_3_2_1_44_1","unstructured":"Yuan Yao Tianyu Yu Ao Zhang Chongyi Wang Junbo Cui Hongji Zhu Tianchi Cai Haoyu Li Weilin Zhao Zhihui He et al. 2024. MiniCPM-V: A GPT-4V Level MLLM on Your Phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.285"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01310"},{"key":"e_1_3_2_1_47_1","volume-title":"Rlaif-v: Aligning mllms through open-source ai feedback for super gpt-4v trustworthiness. arXiv preprint arXiv:2405.17220","author":"Yu Tianyu","year":"2024","unstructured":"Tianyu Yu, Haoye Zhang, Yuan Yao, Yunkai Dang, Da Chen, Xiaoman Lu, Ganqu Cui, Taiwen He, Zhiyuan Liu, Tat-Seng Chua, et al., 2024b. Rlaif-v: Aligning mllms through open-source ai feedback for super gpt-4v trustworthiness. arXiv preprint arXiv:2405.17220 (2024)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Hao Zheng Xinyan Guan Hao Kong Jia Zheng Hongyu Lin Yaojie Lu Ben He Xianpei Han and Le Sun. 2025. PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides. arXiv:2501.03936 [cs.AI] https:\/\/arxiv.org\/abs\/2501.03936","DOI":"10.18653\/v1\/2025.emnlp-main.728"},{"key":"e_1_3_2_1_50_1","volume-title":"Aligning modalities in vision large language models via preference fine-tuning. arXiv preprint arXiv:2402.11411","author":"Zhou Yiyang","year":"2024","unstructured":"Yiyang Zhou, Chenhang Cui, Rafael Rafailov, Chelsea Finn, and Huaxiu Yao. 2024. Aligning modalities in vision large language models via preference fine-tuning. arXiv preprint arXiv:2402.11411 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755221","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:48:58Z","timestamp":1765309738000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755221"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":50,"alternative-id":["10.1145\/3746027.3755221","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755221","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}