{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,20]],"date-time":"2026-04-20T21:14:53Z","timestamp":1776719693329,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":87,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754824","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"11189-11198","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["MVQA-68K: A Multi-dimensional and Causally-annotated Dataset with Quality Interpretability for Video Assessment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-7120-2777","authenticated-orcid":false,"given":"Yanyun","family":"Pu","sequence":"first","affiliation":[{"name":"Huawei Technologies Co., Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5318-4806","authenticated-orcid":false,"given":"Kehan","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5897-2172","authenticated-orcid":false,"given":"Zeyi","family":"Huang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0203-8419","authenticated-orcid":false,"given":"Zhijie","family":"Zhong","sequence":"additional","affiliation":[{"name":"South China University of Technology, GuangZhou, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2180-2101","authenticated-orcid":false,"given":"Kaixiang","family":"Yang","sequence":"additional","affiliation":[{"name":"South China University of Technology, GuangZhou, Guangdong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2868262"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3088505"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01265"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5496296"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Huiyu Duan Qiang Hu JiaruiWang Liu Yang Zitong Xu Lu Liu Xiongkuo Min Chunlei Cai Tianxiao Ye Xiaoyun Zhang et al. 2024. FineVQ: Fine-Grained User Generated Content Video Quality Assessment. arXiv preprint arXiv:2412.19238 (2024).","DOI":"10.1109\/CVPR52734.2025.00305"},{"key":"e_1_3_2_1_7_1","volume-title":"MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video Understanding. arXiv preprint arXiv:2406.14515","author":"Fang Xinyu","year":"2024","unstructured":"Xinyu Fang, Kangrui Mao, Haodong Duan, Xiangyu Zhao, Yining Li, Dahua Lin, and Kai Chen. 2024. MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video Understanding. arXiv preprint arXiv:2406.14515 (2024)."},{"key":"e_1_3_2_1_8_1","unstructured":"Chaoyou Fu Peixian Chen Yunhang Shen Yulei Qin Mengdan Zhang Xu Lin Zhenyu Qiu Wei Lin Jinrui Yang Xiawu Zheng et al. 2023. MME: a comprehensive evaluation benchmark for multimodal large language models. CoRR abs\/2306.13394 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Yuhan Dai, Yongdong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, et al. 2024. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Mme-survey: A comprehensive survey on evaluation of multimodal llms. arXiv preprint arXiv:2411.15296","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Yi-Fan Zhang, Shukang Yin, Bo Li, Xinyu Fang, Sirui Zhao, Haodong Duan, Xing Sun, Ziwei Liu, Liang Wang, et al. 2024. Mme-survey: A comprehensive survey on evaluation of multimodal llms. arXiv preprint arXiv:2411.15296 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"LMM-VQA: Advancing video quality assessment with large multimodal models. arXiv preprint arXiv:2408.14008","author":"Ge Qihang","year":"2024","unstructured":"Qihang Ge,Wei Sun, Yu Zhang, Yunhao Li, Zhongpeng Ji, Fengyu Sun, Shangling Jui, Xiongkuo Min, and Guangtao Zhai. 2024. LMM-VQA: Advancing video quality assessment with large multimodal models. arXiv preprint arXiv:2408.14008 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2707479"},{"key":"e_1_3_2_1_13_1","volume-title":"Videoscore: Building automatic metrics to simulate fine-grained human feedback for video generation. arXiv preprint arXiv:2406.15252","author":"He Xuan","year":"2024","unstructured":"Xuan He, Dongfu Jiang, Ge Zhang, Max Ku, Achint Soni, Sherman Siu, Haonan Chen, Abhranil Chandra, Ziyan Jiang, Aaran Arulraj, et al. 2024. Videoscore: Building automatic metrics to simulate fine-grained human feedback for video generation. arXiv preprint arXiv:2406.15252 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2017.7965673"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2017.7965673"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_2_1_17_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and visionlanguage representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_18_1","volume-title":"VQA2: Visual Question Answering for Video Quality Assessment. arXiv preprint arXiv:2411.03795","author":"Jia Ziheng","year":"2024","unstructured":"Ziheng Jia, Zicheng Zhang, Jiaying Qian, HaoningWu,Wei Sun, Chunyi Li, Xiaohong Liu,Weisi Lin, Guangtao Zhai, and Xiongkuo Min. 2024. VQA2: Visual Question Answering for Video Quality Assessment. arXiv preprint arXiv:2411.03795 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PMLR, 5583-5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583-5594."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2923051"},{"key":"e_1_3_2_1_21_1","volume-title":"Natural Language Understanding and Inference with MLLM in Visual Question Answering: A Survey. Comput. Surveys","author":"Kuang Jiayi","year":"2024","unstructured":"Jiayi Kuang, Ying Shen, Jingyou Xie, Haohao Luo, Zhe Xu, Ronghao Li, Yinghui Li, Xianfeng Cheng, Xika Lin, and Yu Han. 2024. Natural Language Understanding and Inference with MLLM in Visual Question Answering: A Survey. Comput. Surveys (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3164467"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 27th ACM international conference on multimedia. 2351-2359","author":"Li Dingquan","year":"2019","unstructured":"Dingquan Li, Tingting Jiang, and Ming Jiang. 2019. Quality assessment of in-thewild videos. In Proceedings of the 27th ACM international conference on multimedia. 2351-2359."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 27th ACM international conference on multimedia. 2351-2359","author":"Li Dingquan","year":"2019","unstructured":"Dingquan Li, Tingting Jiang, and Ming Jiang. 2019. Quality assessment of in-thewild videos. In Proceedings of the 27th ACM international conference on multimedia. 2351-2359."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIPR49039.2020.00015"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIPR49039.2020.00015"},{"key":"e_1_3_2_1_27_1","volume-title":"VMAF: The journey continues. Netflix Technology Blog 25, 1","author":"Li Zhi","year":"2018","unstructured":"Zhi Li, Christos Bampis, Julie Novak, Anne Aaron, Kyle Swanson, Anush Moorthy, and JD Cock. 2018. VMAF: The journey continues. Netflix Technology Blog 25, 1 (2018)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547849"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","first-page":"1500","DOI":"10.1109\/TIP.2011.2175935","article-title":"Image quality assessment based on gradient similarity","volume":"21","author":"Liu Anmin","year":"2011","unstructured":"Anmin Liu, Weisi Lin, and Manish Narwaria. 2011. Image quality assessment based on gradient similarity. IEEE Transactions on Image Processing 21, 4 (2011), 1500-1512.","journal-title":"IEEE Transactions on Image Processing"},{"key":"e_1_3_2_1_30_1","volume-title":"Llava-next: Improved reasoning, ocr, and world knowledge.","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, and Yong Jae Lee. 2024. Llava-next: Improved reasoning, ocr, and world knowledge."},{"key":"e_1_3_2_1_31_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems 36","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_2_1_32_1","unstructured":"Jie Liu Gongye Liu Jiajun Liang Ziyang Yuan Xiaokun Liu Mingwu Zheng Xiele Wu Qiulin Wang Wenyu Qin Menghan Xia et al. 2025. Improving Video Generation with Human Feedback. arXiv preprint arXiv:2501.13918 (2025)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02090"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"e_1_3_2_1_35_1","first-page":"62352","article-title":"Fetv: A benchmark for fine-grained evaluation of opendomain text-to-video generation","volume":"36","author":"Liu Yuanxin","year":"2023","unstructured":"Yuanxin Liu, Lei Li, Shuhuai Ren, Rundong Gao, Shicheng Li, Sishuo Chen, Xu Sun, and Lu Hou. 2023. Fetv: A benchmark for fine-grained evaluation of opendomain text-to-video generation. Advances in Neural Information Processing Systems 36 (2023), 62352-62387.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"Sora: A review on background, technology, limitations, and opportunities of large vision models. arXiv preprint arXiv:2402.17177","author":"Liu Yixin","year":"2024","unstructured":"Yixin Liu, Kai Zhang, Yuan Li, Zhiling Yan, Chujie Gao, Ruoxi Chen, Zhengqing Yuan, Yue Huang, Hanchi Sun, Jianfeng Gao, et al. 2024. Sora: A review on background, technology, limitations, and opportunities of large vision models. arXiv preprint arXiv:2402.17177 (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02453"},{"key":"e_1_3_2_1_39_1","first-page":"46212","article-title":"Egoschema: A diagnostic benchmark for very long-form video language understanding","volume":"36","author":"Mangalam Karttikeya","year":"2023","unstructured":"Karttikeya Mangalam, Raiymbek Akshulakov, and Jitendra Malik. 2023. Egoschema: A diagnostic benchmark for very long-form video language understanding. Advances in Neural Information Processing Systems 36 (2023), 46212- 46244.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4133-3"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2012.2214050"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2502725"},{"key":"e_1_3_2_1_43_1","volume-title":"Making a ''completely blind'' image quality analyzer","author":"Mittal Anish","year":"2012","unstructured":"Anish Mittal, Rajiv Soundararajan, and Alan C Bovik. 2012. Making a ''completely blind'' image quality analyzer. IEEE Signal processing letters 20, 3 (2012), 209-212."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.23919\/FRUCT50888.2021.9347604"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2562513"},{"key":"e_1_3_2_1_46_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_47_1","volume-title":"A Guide to Image-and Video-Based Small Object Detection Using Deep Learning: Case Study of Maritime Surveillance","author":"Rekavandi Aref Miri","year":"2025","unstructured":"Aref Miri Rekavandi, Lian Xu, Farid Boussaid, Abd-Krim Seghouane, Stephen Hoefs, and Mohammed Bennamoun. 2025. A Guide to Image-and Video-Based Small Object Detection Using Deep Learning: Case Study of Maritime Surveillance. IEEE Transactions on Intelligent Transportation Systems (2025)."},{"key":"e_1_3_2_1_48_1","volume-title":"Blind image quality assessment: A natural scene statistics approach in the DCT domain","author":"Saad Michele A","year":"2012","unstructured":"Michele A Saad, Alan C Bovik, and Christophe Charrier. 2012. Blind image quality assessment: A natural scene statistics approach in the DCT domain. IEEE transactions on Image Processing 21, 8 (2012), 3339-3352."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2869673"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2869673"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3685517"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548329"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548329"},{"key":"e_1_3_2_1_55_1","volume-title":"Enhancing blind video quality assessment with rich quality-aware features. arXiv preprint arXiv:2405.08745","author":"Sun Wei","year":"2024","unstructured":"Wei Sun, Haoning Wu, Zicheng Zhang, Jun Jia, Zhichao Zhang, Linhan Cao, Qiubo Chen, Xiongkuo Min, Weisi Lin, and Guangtao Zhai. 2024. Enhancing blind video quality assessment with rich quality-aware features. arXiv preprint arXiv:2405.08745 (2024)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3072221"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/OJSP.2021.3090333"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2011.6116171"},{"key":"e_1_3_2_1_59_1","volume-title":"A benchmark for multi-view video understanding. arXiv preprint arXiv:2308.12345","author":"Hwang Jenq-Neng","year":"2023","unstructured":"GaoangWang, Jenq-Neng Hwang, Yanting Zhang, and Yan Lu Mv-bench. 2023. A benchmark for multi-view video understanding. arXiv preprint arXiv:2308.12345 (2023)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7532610"},{"key":"e_1_3_2_1_61_1","first-page":"5517","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"38","author":"Xing Jiazheng","year":"2024","unstructured":"MengmengWang, Jiazheng Xing, Boyuan Jiang, Jun Chen, Jianbiao Mei, Xingxing Zuo, Guang Dai, Jingdong Wang, and Yong Liu. 2024. A multimodal, multi-task adapting framework for video action recognition. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38. 5517-5525."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Qiuheng Wang Yukai Shi Jiarong Ou Rui Chen Ke Lin Jiahao Wang Boyuan Jiang Haotian Yang Mingwu Zheng Xin Tao et al. 2024. Koala-36m: A largescale video dataset improving consistency between fine-grained conditions and video content. arXiv preprint arXiv:2410.08260 (2024).","DOI":"10.1109\/CVPR52734.2025.00789"},{"key":"e_1_3_2_1_63_1","volume-title":"2019 IEEE 21st international workshop on multimedia signal processing (MMSP). IEEE, 1-5.","author":"Inguva Sasi","year":"2019","unstructured":"YilinWang, Sasi Inguva, and Balu Adsumilli. 2019. YouTube UGC dataset for video compression research. In 2019 IEEE 21st international workshop on multimedia signal processing (MMSP). IEEE, 1-5."},{"key":"e_1_3_2_1_64_1","volume-title":"2019 IEEE 21st international workshop on multimedia signal processing (MMSP). IEEE, 1-5.","author":"Inguva Sasi","year":"2019","unstructured":"YilinWang, Sasi Inguva, and Balu Adsumilli. 2019. YouTube UGC dataset for video compression research. In 2019 IEEE 21st international workshop on multimedia signal processing (MMSP). IEEE, 1-5."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01323"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_31"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3249741"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01843"},{"key":"e_1_3_2_1_69_1","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia. 1045-1054","author":"Zhang Erli","year":"2023","unstructured":"HaoningWu, Erli Zhang, Liang Liao, Chaofeng Chen, Jingwen Hou, AnnanWang, Wenxiu Sun, Qiong Yan, and Weisi Lin. 2023. Towards explainable in-the-wild video quality assessment: A database and a language-prompted approach. In Proceedings of the 31st ACM International Conference on Multimedia. 1045-1054."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01843"},{"key":"e_1_3_2_1_71_1","volume-title":"Q-bench: A benchmark for general-purpose foundation models on low-level vision. arXiv preprint arXiv:2309.14181","author":"Wu Haoning","year":"2023","unstructured":"Haoning Wu, Zicheng Zhang, Erli Zhang, Chaofeng Chen, Liang Liao, Annan Wang, Chunyi Li, Wenxiu Sun, Qiong Yan, Guangtao Zhai, et al. 2023. Q-bench: A benchmark for general-purpose foundation models on low-level vision. arXiv preprint arXiv:2309.14181 (2023)."},{"key":"e_1_3_2_1_72_1","volume-title":"Q-bench: A benchmark for general-purpose foundation models on low-level vision. arXiv preprint arXiv:2309.14181","author":"Wu Haoning","year":"2023","unstructured":"Haoning Wu, Zicheng Zhang, Erli Zhang, Chaofeng Chen, Liang Liao, Annan Wang, Chunyi Li, Wenxiu Sun, Qiong Yan, Guangtao Zhai, et al. 2023. Q-bench: A benchmark for general-purpose foundation models on low-level vision. arXiv preprint arXiv:2309.14181 (2023)."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02408"},{"key":"e_1_3_2_1_74_1","volume-title":"Q-align: Teaching lmms for visual scoring via discrete text-defined levels. arXiv preprint arXiv:2312.17090","author":"Zhang Zicheng","year":"2023","unstructured":"HaoningWu, Zicheng Zhang,Weixia Zhang, Chaofeng Chen, Liang Liao, Chunyi Li, Yixuan Gao, Annan Wang, Erli Zhang, Wenxiu Sun, et al. 2023. Q-align: Teaching lmms for visual scoring via discrete text-defined levels. arXiv preprint arXiv:2312.17090 (2023)."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680598"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC59245.2023.00014"},{"key":"e_1_3_2_1_77_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 14019-14029","author":"Ying Zhenqiang","year":"2021","unstructured":"Zhenqiang Ying, Maniratnam Mandal, Deepti Ghadiyaram, and Alan Bovik. 2021. Patch-vq:'patching up'the video quality problem. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 14019-14029."},{"key":"e_1_3_2_1_78_1","volume-title":"Descriptive image quality assessment in the wild. arXiv preprint arXiv:2405.18842","author":"You Zhiyuan","year":"2024","unstructured":"Zhiyuan You, Jinjin Gu, Zheyuan Li, Xin Cai, Kaiwen Zhu, Chao Dong, and Tianfan Xue. 2024. Descriptive image quality assessment in the wild. arXiv preprint arXiv:2405.18842 (2024)."},{"key":"e_1_3_2_1_79_1","volume-title":"European Conference on Computer Vision. Springer, 259-276","author":"You Zhiyuan","year":"2024","unstructured":"Zhiyuan You, Zheyuan Li, Jinjin Gu, Zhenfei Yin, Tianfan Xue, and Chao Dong. 2024. Depicting beyond scores: Advancing image quality assessment through multi-modal language models. In European Conference on Computer Vision. Springer, 259-276."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612023"},{"key":"e_1_3_2_1_81_1","volume-title":"Internlm-xcomposer: A vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112","author":"Zhang Pan","year":"2023","unstructured":"Pan Zhang, Xiaoyi Dong, Bin Wang, Yuhang Cao, Chao Xu, Linke Ouyang, Zhiyuan Zhao, Haodong Duan, Songyang Zhang, Shuangrui Ding, et al. 2023. Internlm-xcomposer: A vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112 (2023)."},{"key":"e_1_3_2_1_82_1","unstructured":"Yi-Fan Zhang Huanyu Zhang Haochen Tian Chaoyou Fu Shuangqing Zhang Junfei Wu Feng Li Kun Wang Qingsong Wen Zhang Zhang et al. 2024. MMERealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans? arXiv preprint arXiv:2408.13257 (2024)."},{"key":"e_1_3_2_1_83_1","unstructured":"Zicheng Zhang Ziheng Jia Haoning Wu Chunyi Li Zijian Chen Yingjie Zhou Wei Sun Xiaohong Liu Xiongkuo Min Weisi Lin et al. 2024. Q-Bench-Video: Benchmarking the Video Quality Understanding of LMMs. arXiv preprint arXiv:2409.20063 (2024)."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW63481.2024.10645451"},{"key":"e_1_3_2_1_85_1","volume-title":"andWeisi Lin","author":"Zhang Zicheng","year":"2024","unstructured":"Zicheng Zhang, HaoningWu, Erli Zhang, Guangtao Zhai, andWeisi Lin. 2024. QBench: A Benchmark for Multi-modal Foundation Models on Low-level Vision from Single Images to Pairs. arXiv preprint arXiv:2402.07116 (2024)."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00174"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3215311"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754824","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:43Z","timestamp":1765309543000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754824"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":87,"alternative-id":["10.1145\/3746027.3754824","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754824","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}