{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:11:40Z","timestamp":1778080300302,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The RIE2020 Industry Alignment Fund \u2013 Industry Collaboration Projects (IAF-ICP) Funding Initiative, along with cash and in-kind contributions from our industry partner(s)."},{"name":"College of Engineering (CoE) Research Award 2023 at Nanyang Technological University","award":["022877"],"award-info":[{"award-number":["022877"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680575","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"486-495","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Q-Ground: Image Quality Grounding with Large Multi-modality Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6137-5162","authenticated-orcid":false,"given":"Chaofeng","family":"Chen","sequence":"first","affiliation":[{"name":"S-Lab, Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9382-9947","authenticated-orcid":false,"given":"Sensen","family":"Yang","sequence":"additional","affiliation":[{"name":"S-Lab, Nanyang Technological University, Singapore, 
Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8642-8101","authenticated-orcid":false,"given":"Haoning","family":"Wu","sequence":"additional","affiliation":[{"name":"S-Lab, Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2238-2420","authenticated-orcid":false,"given":"Liang","family":"Liao","sequence":"additional","affiliation":[{"name":"S-Lab, Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7247-7938","authenticated-orcid":false,"given":"Zicheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2998-9817","authenticated-orcid":false,"given":"Annan","family":"Wang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5026-8820","authenticated-orcid":false,"given":"Wenxiu","family":"Sun","sequence":"additional","affiliation":[{"name":"SenseTime Research, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2942-267X","authenticated-orcid":false,"given":"Qiong","family":"Yan","sequence":"additional","affiliation":[{"name":"SenseTime Research, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9866-1947","authenticated-orcid":false,"given":"Weisi","family":"Lin","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Proceedings of the European Conference on Computer Vision Workshops (ECCVW). 334--355","author":"Blau Yochai","year":"2018","unstructured":"Yochai Blau, Roey Mechrez, Radu Timofte, Tomer Michaeli, and Lihi Zelnik-Manor. 2018. The 2018 PIRM challenge on perceptual image super-resolution. 
In Proceedings of the European Conference on Computer Vision Workshops (ECCVW). 334--355."},{"key":"e_1_3_2_2_2_1","first-page":"1","article-title":"Deep neural networks for no-reference and full-reference image quality assessment","volume":"27","author":"Bosse Sebastian","year":"2017","unstructured":"Sebastian Bosse, Dominique Maniry, Klaus-Robert M\u00fcller, Thomas Wiegand, and Wojciech Samek. 2017. Deep neural networks for no-reference and full-reference image quality assessment. IEEE Transactions on Image Processing (TIP), Vol. 27, 1 (Oct. 2017), 206--219.","journal-title":"IEEE Transactions on Image Processing (TIP)"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00132"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3378466"},{"key":"e_1_3_2_2_5_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. 
arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00054"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2023.3250956"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00373"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2500021"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2967829"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3186307"},{"key":"e_1_3_2_2_14_1","volume-title":"Towards Transparent Deep Image Aesthetics Assessment with Tag-based Content Descriptors","author":"Hou Jingwen","year":"2023","unstructured":"Jingwen Hou, Weisi Lin, Yuming Fang, Haoning Wu, Chaofeng Chen, Liang Liao, and Weide Liu. 2023. Towards Transparent Deep Image Aesthetics Assessment with Tag-based Content Descriptors. IEEE Transactions on Image Processing (TIP) (2023)."},{"key":"e_1_3_2_2_15_1","unstructured":"Zhipeng Huang Zhizheng Zhang Yiting Lu Zheng-Jun Zha Zhibo Chen and Baining Guo. 2024. VisualCritic: Making LMMs Perceive Visual Quality Like Humans. arxiv: 2403.12806 [cs.CV]"},{"key":"e_1_3_2_2_16_1","volume-title":"Multi-channel adaptive partitioning network for block-based image compressive sensing","author":"Hui Chen","unstructured":"Chen Hui, Shaohui Liu, and Feng Jiang. 2022. Multi-channel adaptive partitioning network for block-based image compressive sensing. 
IEEE, 1--6."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3301213"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00510"},{"key":"e_1_3_2_2_19_1","volume-title":"VILA: Learning Image Aesthetics from User Comments with Vision-Language Pretraining. arxiv: 2303.14302 [cs.CV]","author":"Ke Junjie","year":"2023","unstructured":"Junjie Ke, Keren Ye, Jiahui Yu, Yonghui Wu, Peyman Milanfar, and Feng Yang. 2023. VILA: Learning Image Aesthetics from User Comments with Vision-Language Pretraining. arxiv: 2303.14302 [cs.CV]"},{"key":"e_1_3_2_2_20_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 1676--1684","author":"Kim Jongyoo","year":"2017","unstructured":"Jongyoo Kim and Sanghoon Lee. 2017. Deep learning of human visual sensitivity in image quality assessment framework. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 1676--1684."},{"key":"e_1_3_2_2_21_1","volume-title":"LISA: Reasoning Segmentation via Large Language Model. arXiv preprint arXiv:2308.00692","author":"Lai Xin","year":"2023","unstructured":"Xin Lai, Zhuotao Tian, Yukang Chen, Yanwei Li, Yuhui Yuan, Shu Liu, and Jiaya Jia. 2023. LISA: Reasoning Segmentation via Large Language Model. arXiv preprint arXiv:2308.00692 (2023)."},{"key":"e_1_3_2_2_22_1","volume-title":"Perceptual image quality assessment using a normalized Laplacian pyramid. Human Vision and Electronic Imaging (HVEI)","author":"Laparra Valero","year":"2016","unstructured":"Valero Laparra, Johannes Ball\u00e9, Alexander Berardino, and Eero P Simoncelli. 2016. Perceptual image quality assessment using a normalized Laplacian pyramid. 
Human Vision and Electronic Imaging (HVEI) (2016), 43--48."},{"key":"e_1_3_2_2_23_1","volume-title":"Most apparent distortion: full-reference image quality assessment and the role of strategy","author":"Larson Eric Cooper","year":"2010","unstructured":"Eric Cooper Larson and Damon Michael Chandler. 2010. Most apparent distortion: full-reference image quality assessment and the role of strategy. , Vol. 19, 1 (2010), 011006."},{"key":"e_1_3_2_2_24_1","unstructured":"Chunyi Li Zicheng Zhang Haoning Wu Wei Sun Xiongkuo Min Xiaohong Liu Guangtao Zhai and Weisi Lin. 2023. AGIQA-3K: An Open Database for AI-Generated Image Quality Assessment. arxiv: 2306.04717 [cs.CV]"},{"key":"e_1_3_2_2_25_1","volume-title":"Semantic-SAM: Segment and Recognize Anything at Any Granularity. arXiv preprint arXiv:2307.04767","author":"Li Feng","year":"2023","unstructured":"Feng Li, Hao Zhang, Peize Sun, Xueyan Zou, Shilong Liu, Jianwei Yang, Chunyuan Li, Lei Zhang, and Jianfeng Gao. 2023. Semantic-SAM: Segment and Recognize Anything at Any Granularity. arXiv preprint arXiv:2307.04767 (2023)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"crossref","unstructured":"Liang Liao Kangmin Xu Haoning Wu Chaofeng Chen Wenxiu Sun Qiong Yan and Weisi Lin. 2022. Exploring the Effectiveness of Video Perceptual Representation in Blind Video Quality Assessment. In ACM Multimedia (MM).","DOI":"10.1145\/3503161.3547849"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2019.8743252"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_29_1","volume-title":"Visual Instruction Tuning. arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. 
arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_2_30_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.12.009"},{"key":"e_1_3_2_2_32_1","article-title":"No-Reference Image Quality Assessment in the Spatial Domain","volume":"21","author":"Mittal Anish","year":"2012","unstructured":"Anish Mittal, Anush Krishna Moorthy, and Alan Conrad Bovik. 2012. No-Reference Image Quality Assessment in the Spatial Domain. IEEE Transactions on Image Processing (TIP), Vol. 21, 12 (2012).","journal-title":"IEEE Transactions on Image Processing (TIP)"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2012.2214050"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2012.2227726"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2011.2147325"},{"key":"e_1_3_2_2_36_1","unstructured":"OpenAI. 2023. GPT-4V(ision) System Card. https:\/\/api.semanticscholar.org\/CorpusID:263218031"},{"key":"e_1_3_2_2_37_1","volume-title":"Kosmos-2: Grounding Multimodal Large Language Models to the World. ArXiv","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding Multimodal Large Language Models to the World. ArXiv, Vol. abs\/2306 (2023)."},{"key":"e_1_3_2_2_38_1","first-page":"30","article-title":"TID2008-a database for evaluation of full-reference visual quality assessment metrics","volume":"10","author":"Ponomarenko Nikolay","year":"2009","unstructured":"Nikolay Ponomarenko, Vladimir Lukin, Alexander Zelensky, Karen Egiazarian, Marco Carli, and Federica Battisti. 2009. 
TID2008-a database for evaluation of full-reference visual quality assessment metrics. Advances of Modern Radioelectronics, Vol. 10, 4 (2009), 30--45.","journal-title":"Advances of Modern Radioelectronics"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00194"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3042066"},{"key":"e_1_3_2_2_41_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision."},{"key":"e_1_3_2_2_42_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Rasheed Hanoona","year":"2024","unstructured":"Hanoona Rasheed, Muhammad Maaz, Sahal Shaji, Abdelrahman Shaker, Salman Khan, Hisham Cholakkal, Rao M. Anwer, Eric Xing, Ming-Hsuan Yang, and Fahad S. Khan. 2024. GLaMM: Pixel Grounding Large Multimodal Model. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2024)."},{"key":"e_1_3_2_2_43_1","volume-title":"Pixellm: Pixel reasoning with large multimodal model.","author":"Ren Zhongwei","year":"2024","unstructured":"Zhongwei Ren, Zhicheng Huang, Yunchao Wei, Yao Zhao, Dongmei Fu, Jiashi Feng, and Xiaojie Jin. 2024. Pixellm: Pixel reasoning with large multimodal model. 
(2024), 26374--26383."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2009.2025923"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2005.859378"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2006.881959"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00372"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2023.3270621"},{"key":"e_1_3_2_2_49_1","volume-title":"NIMA: Neural Image Assessment","author":"Talebi Hossein","year":"2018","unstructured":"Hossein Talebi and Peyman Milanfar. 2018. NIMA: Neural Image Assessment. IEEE Transactions on Image Processing (TIP) (2018)."},{"key":"e_1_3_2_2_50_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv: 2302.13971 [cs.CL]"},{"key":"e_1_3_2_2_51_1","unstructured":"Jianyi Wang Kelvin C. K. Chan and Chen Change Loy. 2022. Exploring CLIP for Assessing the Look and Feel of Images."},{"key":"e_1_3_2_2_52_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","volume":"36","author":"Wang Wenhai","year":"2024","unstructured":"Wenhai Wang, Zhe Chen, Xiaokang Chen, Jiannan Wu, Xizhou Zhu, Gang Zeng, Ping Luo, Tong Lu, Jie Zhou, Yu Qiao, et al. 2024. Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Proceedings of Advances in Neural Information Processing Systems (NeurIPS), Vol. 
36 (2024)."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3257564"},{"key":"e_1_3_2_2_56_1","first-page":"4577","article-title":"Active fine-tuning from gMAD examples improves blind image quality assessment","volume":"44","author":"Wang Zhihua","year":"2021","unstructured":"Zhihua Wang and Kede Ma. 2021. Active fine-tuning from gMAD examples improves blind image quality assessment. IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI), Vol. 44, 9 (2021), 4577--4590.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01599"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_31"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3319332"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01843"},{"key":"e_1_3_2_2_61_1","unstructured":"Haoning Wu Zicheng Zhang Erli Zhang Chaofeng Chen Liang Liao Annan Wang Chunyi Li Wenxiu Sun Qiong Yan Guangtao Zhai and Weisi Lin. 2023. Q-Bench: A Benchmark for General-Purpose Foundation Models on Low-level Vision. arxiv: 2309.14181 [cs.CV]"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02408"},{"key":"e_1_3_2_2_63_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Wu Haoning","year":"2024","unstructured":"Haoning Wu, Zicheng Zhang, Weixia Zhang, Chaofeng Chen, Chunyi Li, Liang Liao, Annan Wang, Erli Zhang, Wenxiu Sun, Qiong Yan, Xiongkuo Min, Guangtai Zhai, and Weisi Lin. 2024. Q-Align: Teaching LMMs for Visual Scoring via Discrete Text-Defined Levels. 
Proceedings of the International Conference on Machine Learning (ICML) (2024)."},{"key":"e_1_3_2_2_64_1","unstructured":"Haoning Wu Hanwei Zhu Zicheng Zhang Erli Zhang Chaofeng Chen Liang Liao Chunyi Li Annan Wang Wenxiu Sun Qiong Yan Xiaohong Liu Guangtao Zhai Shiqi Wang and Weisi Lin. 2024 d. Towards Open-ended Visual Quality Comparison. arxiv: 2402.16641 [cs.CV]"},{"key":"e_1_3_2_2_65_1","unstructured":"Tianhe Wu Kede Ma Jie Liang Yujiu Yang and Lei Zhang. 2024. A Comprehensive Study of Multimodal Large Language Models for Image Quality Assessment. arxiv: 2403.10854 [cs.CV]"},{"key":"e_1_3_2_2_66_1","volume-title":"A Semantic Space is Worth 256 Language Descriptions: Make Stronger Segmentation Models with Descriptive Properties. arXiv preprint arXiv:2312.13764","author":"Xiao Junfei","year":"2023","unstructured":"Junfei Xiao, Ziqi Zhou, Wenxuan Li, Shiyi Lan, Jieru Mei, Zhiding Yu, Alan Yuille, Yuyin Zhou, and Cihang Xie. 2023. A Semantic Space is Worth 256 Language Descriptions: Make Stronger Segmentation Models with Descriptive Properties. arXiv preprint arXiv:2312.13764 (2023)."},{"key":"e_1_3_2_2_67_1","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS).","author":"Xie Enze","year":"2021","unstructured":"Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M Alvarez, and Ping Luo. 2021. SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers. In Proceedings of Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_2_68_1","unstructured":"Jiazheng Xu Xiao Liu Yuchen Wu Yuxuan Tong Qinkai Li Ming Ding Jie Tang and Yuxiao Dong. 2023. ImageReward: Learning and Evaluating Human Preferences for Text-to-Image Generation. 
arxiv: 2304.05977 [cs.CV]"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00257"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2013.2293423"},{"key":"e_1_3_2_2_72_1","volume-title":"Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V. arXiv preprint arXiv:2310.11441","author":"Yang Jianwei","year":"2023","unstructured":"Jianwei Yang, Hao Zhang, Feng Li, Xueyan Zou, Chunyuan Li, and Jianfeng Gao. 2023. Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V. arXiv preprint arXiv:2310.11441 (2023)."},{"key":"e_1_3_2_2_73_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi Chaoya Jiang Chenliang Li Yuanhong Xu Hehong Chen Junfeng Tian Qian Qi Ji Zhang and Fei Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arxiv: 2304.14178 [cs.CL]"},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00363"},{"key":"e_1_3_2_2_75_1","volume-title":"Descriptive Image Quality Assessment in the Wild. arXiv preprint arXiv:2405.18842","author":"You Zhiyuan","year":"2024","unstructured":"Zhiyuan You, Jinjin Gu, Zheyuan Li, Xin Cai, Kaiwen Zhu, Tianfan Xue, and Chao Dong. 2024. Descriptive Image Quality Assessment in the Wild. 
arXiv preprint arXiv:2405.18842 (2024)."},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2014.2346028"},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2426416"},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2015.2426416"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2011.2109730"},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_2_81_1","volume-title":"Gpt4roi: Instruction tuning large language model on region-of-interest. arXiv preprint arXiv:2307.03601","author":"Zhang Shilong","year":"2023","unstructured":"Shilong Zhang, Peize Sun, Shoufa Chen, Min Xiao, Wenqi Shao, Wenwei Zhang, Kai Chen, and Ping Luo. 2023. Gpt4roi: Instruction tuning large language model on region-of-interest. arXiv preprint arXiv:2307.03601 (2023)."},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2886771"},{"key":"e_1_3_2_2_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01352"},{"key":"e_1_3_2_2_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on 
Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680575","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680575","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680575"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":84,"alternative-id":["10.1145\/3664647.3680575","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680575","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}