{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T19:07:35Z","timestamp":1774379255541,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"OPPO Research Fund"},{"name":"Double First-Class Overseas Research Project of Xidian University"},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171340, 61991451 and 61771473"],"award-info":[{"award-number":["62171340, 61991451 and 61771473"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680649","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"5911-5920","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":25,"title":["AesExpert: Towards Multi-modality Foundation Model for Image Aesthetics Perception"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0908-2180","authenticated-orcid":false,"given":"Yipo","family":"Huang","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, Xidian University &amp; Nanyang Technological University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8468-1970","authenticated-orcid":false,"given":"Xiangfei","family":"Sheng","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3398-1286","authenticated-orcid":false,"given":"Zhichao","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5929-2026","authenticated-orcid":false,"given":"Quan","family":"Yuan","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8004-387X","authenticated-orcid":false,"given":"Zhichao","family":"Duan","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0509-3782","authenticated-orcid":false,"given":"Pengfei","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9069-8796","authenticated-orcid":false,"given":"Leida","family":"Li","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9866-1947","authenticated-orcid":false,"given":"Weisi","family":"Lin","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2179-3292","authenticated-orcid":false,"given":"Guangming","family":"Shi","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, Shaanxi, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01140"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_3_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478","author":"Chen Jun","year":"2023","unstructured":"Jun Chen, Deyao Zhu, Xiaoqian Shen, Xiang Li, Zechu Liu, Pengchuan Zhang, Raghuraman Krishnamoorthi, Vikas Chandra, Yunyang Xiong, and Mohamed Elhoseiny. 2023. MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions. arXiv preprint arXiv:2311.12793","author":"Chen Lin","year":"2023","unstructured":"Lin Chen, Jisong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2023. ShareGPT4V: Improving Large Multi-Modal Models with Better Captions. arXiv preprint arXiv:2311.12793 (2023)."},{"key":"e_1_3_2_1_6_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Yunxuan Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma Albert Webson Shixiang Shane Gu Zhuyun Dai Mirac Suzgun Xinyun Chen Aakanksha Chowdhery Alex Castro-Ros Marie Pellat Kevin Robinson Dasha Valter Sharan Narang Gaurav Mishra Adams Yu Vincent Zhao Yanping Huang Andrew Dai Hongkun Yu Slav Petrov Ed H. Chi Jeff Dean Jacob Devlin Adam Roberts Denny Zhou Quoc V. Le and Jason Wei. 2022. Scaling Instruction-Finetuned Language Models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv preprint arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2696576"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00373"},{"key":"e_1_3_2_1_10_1","unstructured":"Google. 2023. Build with Gemini. https:\/\/ai.google.dev\/"},{"key":"e_1_3_2_1_11_1","volume-title":"The Instinctive Bias: Spurious Images lead to Hallucination in MLLMs. arXiv preprint arXiv:2402.03757","author":"Han Tianyang","year":"2024","unstructured":"Tianyang Han, Qing Lian, Rui Pan, Renjie Pi, Jipeng Zhang, Shizhe Diao, Yong Lin, and Tong Zhang. 2024. The Instinctive Bias: Spurious Images lead to Hallucination in MLLMs. arXiv preprint arXiv:2402.03757 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/132"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.3389\/frai.2022.976235"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295748"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2967829"},{"key":"e_1_3_2_1_16_1","volume-title":"Visual Hallucinations of Multi-modal Large Language Models. arXiv preprint arXiv:2402.14683","author":"Huang Wen","year":"2024","unstructured":"Wen Huang, Hongbin Liu, Minxin Guo, and Neil Zhenqiang Gong. 2024. Visual Hallucinations of Multi-modal Large Language Models. arXiv preprint arXiv:2402.14683 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3389452"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3225728"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1049\/iet-ipr.2019.1195"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2020.2988830"},{"key":"e_1_3_2_1_21_1","volume-title":"AesBench: An Expert Benchmark for Multimodal Large Language Models on Image Aesthetics Perception. arXiv preprint arXiv:2401.08276","author":"Huang Yipo","year":"2024","unstructured":"Yipo Huang, Quan Yuan, Xiangfei Sheng, Zhichao Yang, Haoning Wu, Pengfei Chen, Yuzhe Yang, Leida Li, and Weisi Lin. 2024. AesBench: An Expert Benchmark for Multimodal Large Language Models on Image Aesthetics Perception. arXiv preprint arXiv:2401.08276 (2024)."},{"key":"e_1_3_2_1_22_1","unstructured":"Huggingface. 2023. Introducing IDEFICS: An open reproduction of state-of-the-art visual language model. https:\/\/huggingface.co\/blog\/idefics"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859846"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503160"},{"key":"e_1_3_2_1_25_1","volume-title":"Rate-adaptive neural network for image compressive sensing","author":"Hui Chen","year":"2023","unstructured":"Chen Hui, Shengping Zhang, Wenxue Cui, Shaohui Liu, Feng Jiang, and Debin Zhao. 2023. Rate-adaptive neural network for image compressive sensing. IEEE Trans. Multimedia (2023)."},{"key":"e_1_3_2_1_26_1","unstructured":"ITU. 2012. Methodology for the Subjective Assessment of the Quality of Television Pictures. In Recommendation ITU-R BT.500--13. ITU."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00968"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_40"},{"key":"e_1_3_2_1_29_1","volume-title":"Impressions: Understanding Visual Semiotics and Aesthetic Impact. arXiv preprint arXiv:2310.17887","author":"Kruk Julia","year":"2023","unstructured":"Julia Kruk, Caleb Ziems, and Diyi Yang. 2023. Impressions: Understanding Visual Semiotics and Aesthetic Impact. arXiv preprint arXiv:2310.17887 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"LISA: Reasoning Segmentation via Large Language Model. arXiv preprint arXiv:2308.00692","author":"Lai Xin","year":"2023","unstructured":"Xin Lai, Zhuotao Tian, Yukang Chen, Yanwei Li, Yuhui Yuan, Shu Liu, and Jiaya Jia. 2023. LISA: Reasoning Segmentation via Large Language Model. arXiv preprint arXiv:2308.00692 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Neural Abstract Style Transfer for Chinese Traditional Painting. arXiv preprint arXiv:1812.03264","author":"Li Bo","year":"2018","unstructured":"Bo Li, Caiming Xiong, Tianfu Wu, Yu Zhou, Lun Zhang, and Rufeng Chu. 2018. Neural Abstract Style Transfer for Chinese Traditional Painting. arXiv preprint arXiv:1812.03264 (2018)."},{"key":"e_1_3_2_1_32_1","volume-title":"Otter: A Multi-Modal Model with In-Context Instruction Tuning. arXiv preprint arXiv:2305.03726","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Jingkang Yang, and Ziwei Liu. 2023. Otter: A Multi-Modal Model with In-Context Instruction Tuning. arXiv preprint arXiv:2305.03726 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"AGIQA-3K: An Open Database for AI-Generated Image Quality Assessment. arXiv preprint arXiv:2306.04717","author":"Li Chunyi","year":"2023","unstructured":"Chunyi Li, Zicheng Zhang, Haoning Wu, Wei Sun, Xiongkuo Min, Xiaohong Liu, Guangtao Zhai, and Weisi Lin. 2023. AGIQA-3K: An Open Database for AI-Generated Image Quality Assessment. arXiv preprint arXiv:2306.04717 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3249185"},{"key":"e_1_3_2_1_35_1","article-title":"Personality-assisted multi-task learning for generic and personalized image aesthetics assessment","volume":"29","author":"Li Leida","year":"2020","unstructured":"Leida Li, Hancheng Zhu, Sicheng Zhao, Guiguang Ding, and Weisi Lin. 2020. Personality-assisted multi-task learning for generic and personalized image aesthetics assessment. IEEE Trans. Image Process., Vol. 29 (Jan. 2020), 3898--3910.","journal-title":"IEEE Trans. Image Process."},{"key":"e_1_3_2_1_36_1","volume-title":"FakeBench: Uncover the Achilles' Heels of Fake Images with Large Multimodal Models. arXiv preprint arXiv:2404.13306","author":"Li Yixuan","year":"2024","unstructured":"Yixuan Li, Xuelin Liu, Xiaoyang Wang, Shiqi Wang, and Weisi Lin. 2024. FakeBench: Uncover the Achilles' Heels of Fake Images with Large Multimodal Models. arXiv preprint arXiv:2404.13306 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Microsoft COCO: Common Objects in Context. arXiv preprint arXiv:1405.0312","author":"Lin Tsung-Yi","year":"2015","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, Lubomir Bourdev, Ross Girshick, James Hays, Pietro Perona, Deva Ramanan, C. Lawrence Zitnick, and Piotr Doll\u00e1r. 2015. Microsoft COCO: Common Objects in Context. arXiv preprint arXiv:1405.0312 (2015)."},{"key":"e_1_3_2_1_38_1","volume-title":"Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Visual Instruction Tuning. arXiv preprint arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. arXiv preprint arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Proc. Neural Inf. Process. Syst. 2507--2521","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tony Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering. In Proc. Neural Inf. Process. Syst. 2507--2521."},{"key":"e_1_3_2_1_41_1","volume-title":"CLiF-VQA: Enhancing Video Quality Assessment by Incorporating High-Level Semantic Information related to Human Feelings. arXiv preprint arXiv:2311.07090","author":"Mi Yachun","year":"2023","unstructured":"Yachun Mi, Yu Li, Yan Shu, Chen Hui, Puchao Zhou, and Shaohui Liu. 2023. CLiF-VQA: Enhancing Video Quality Assessment by Incorporating High-Level Semantic Information related to Human Feelings. arXiv preprint arXiv:2311.07090 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448422"},{"key":"e_1_3_2_1_43_1","volume-title":"VVA: Video Values Analysis. In Chinese Conf. Pattern Recog. Comput. Vis. Springer, 346--358","author":"Mi Yachun","year":"2023","unstructured":"Yachun Mi, Yan Shu, Honglei Xu, Shaohui Liu, and Feng Jiang. 2023. VVA: Video Values Analysis. In Chinese Conf. Pattern Recog. Comput. Vis. Springer, 346--358."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247954"},{"key":"e_1_3_2_1_47_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv preprint arXiv:2103.00020 (2021)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611969"},{"key":"e_1_3_2_1_49_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_50_1","unstructured":"Unsplash. 2023. Access the world's largest open library dataset. https:\/\/unsplash.com\/data"},{"key":"e_1_3_2_1_51_1","volume-title":"DiffusionDB: A Large-scale Prompt Gallery Dataset for Text-to-Image Generative Models. arXiv preprint arXiv:2210.14896","author":"Wang Zijie J.","year":"2023","unstructured":"Zijie J. Wang, Evan Montoya, David Munechika, Haoyang Yang, Benjamin Hoover, and Duen Horng Chau. 2023. DiffusionDB: A Large-scale Prompt Gallery Dataset for Text-to-Image Generative Models. arXiv preprint arXiv:2210.14896 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"Q-Bench: A Benchmark for General-Purpose Foundation Models on Low-level Vision. arXiv preprint arXiv:2309.14181","author":"Wu Haoning","year":"2024","unstructured":"Haoning Wu, Zicheng Zhang, Erli Zhang, Chaofeng Chen, Liang Liao, Annan Wang, Chunyi Li, Wenxiu Sun, Qiong Yan, Guangtao Zhai, and Weisi Lin. 2024. Q-Bench: A Benchmark for General-Purpose Foundation Models on Low-level Vision. arXiv preprint arXiv:2309.14181 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Q-Instruct: Improving Low-level Visual Abilities for Multi-modality Foundation Models. arXiv preprint arXiv:2311.06783","author":"Wu Haoning","year":"2023","unstructured":"Haoning Wu, Zicheng Zhang, Erli Zhang, Chaofeng Chen, Liang Liao, Annan Wang, Kaixin Xu, Chunyi Li, Jingwen Hou, Guangtao Zhai, Geng Xue, Wenxiu Sun, Qiong Yan, and Weisi Lin. 2023. Q-Instruct: Improving Low-level Visual Abilities for Multi-modality Foundation Models. arXiv preprint arXiv:2311.06783 (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"Q-Align: Teaching LMMs for Visual Scoring via Discrete Text-Defined Levels. arXiv preprint arXiv:2312.17090","author":"Wu Haoning","year":"2023","unstructured":"Haoning Wu, Zicheng Zhang, Weixia Zhang, Chaofeng Chen, Chunyi Li, Liang Liao, Annan Wang, Erli Zhang, Wenxiu Sun, Qiong Yan, Xiongkuo Min, Guangtai Zhai, and Weisi Lin. 2023. Q-Align: Teaching LMMs for Visual Scoring via Discrete Text-Defined Levels. arXiv preprint arXiv:2312.17090 (2023)."},{"key":"e_1_3_2_1_55_1","unstructured":"Haoning Wu Hanwei Zhu Zicheng Zhang Erli Zhang Chaofeng Chen Liang Liao Chunyi Li Annan Wang Wenxiu Sun Qiong Yan Xiaohong Liu Guangtao Zhai Shiqi Wang and Weisi Lin. 2024. Towards Open-ended Visual Quality Comparison."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2012.2190924"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2393347.2393400"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/2072298.2072350"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2502083"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01924"},{"key":"e_1_3_2_1_61_1","volume-title":"The Dawn of LMMs: Preliminary Explorations with GPT-4V(ision). arXiv preprint arXiv:2309.17421","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023. The Dawn of LMMs: Preliminary Explorations with GPT-4V(ision). arXiv preprint arXiv:2309.17421 (2023)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3290479"},{"key":"e_1_3_2_1_63_1","volume-title":"mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arXiv preprint arXiv:2304.14178","author":"Ye Qinghao","year":"2023","unstructured":"Qinghao Ye, Haiyang Xu, Guohai Xu, Jiabo Ye, Ming Yan, Yiyang Zhou, Junyang Wang, Anwen Hu, Pengcheng Shi, Yaya Shi, Chenliang Li, Yuanhong Xu, Hehong Chen, Junfeng Tian, Qian Qi, Ji Zhang, and Fei Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_64_1","volume-title":"mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration. arXiv preprint arXiv:2311.04257","author":"Ye Qinghao","year":"2023","unstructured":"Qinghao Ye, Haiyang Xu, Jiabo Ye, Ming Yan, Anwen Hu, Haowei Liu, Qi Qian, Ji Zhang, Fei Huang, and Jingren Zhou. 2023. mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration. arXiv preprint arXiv:2311.04257 (2023)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02144"},{"key":"e_1_3_2_1_66_1","volume-title":"Instruction Tuning for Large Language Models: A Survey. arXiv preprint arXiv:2308.10792","author":"Zhang Shengyu","year":"2024","unstructured":"Shengyu Zhang, Linfeng Dong, Xiaoya Li, Sen Zhang, Xiaofei Sun, Shuhe Wang, Jiwei Li, Runyi Hu, Tianwei Zhang, Fei Wu, and Guoyin Wang. 2024. Instruction Tuning for Large Language Models: A Survey. arXiv preprint arXiv:2308.10792 (2024)."},{"key":"e_1_3_2_1_67_1","volume-title":"A Perceptual Quality Assessment Exploration for AIGC Images. arXiv preprint arXiv:2303.12618","author":"Zhang Zicheng","year":"2023","unstructured":"Zicheng Zhang, Chunyi Li, Wei Sun, Xiaohong Liu, Xiongkuo Min, and Guangtao Zhai. 2023. A Perceptual Quality Assessment Exploration for AIGC Images. arXiv preprint arXiv:2303.12618 (2023)."},{"key":"e_1_3_2_1_68_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zheng Lianmin","year":"2024","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al. 2024. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_69_1","volume-title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.2984670"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611942"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3123468"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680649","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680649","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:57Z","timestamp":1750295877000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680649"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":71,"alternative-id":["10.1145\/3664647.3680649","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680649","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}