{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:20:12Z","timestamp":1765340412386,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62471349, 62171340, and 62301378"],"award-info":[{"award-number":["62471349, 62171340, and 62301378"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["QTZX25076"],"award-info":[{"award-number":["QTZX25076"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangzhou Municipal Government-University (Institute)-Enterprises Jointly Founded Project","award":["2025A03J3123"],"award-info":[{"award-number":["2025A03J3123"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754931","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"6830-6839","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["InstructCrop: Teaching Multimodal Large Language Models to Crop Aesthetic Images"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8468-1970","authenticated-orcid":false,"given":"Xiangfei","family":"Sheng","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7768-981X","authenticated-orcid":false,"given":"Pangu","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4284-6042","authenticated-orcid":false,"given":"Weidong","family":"Zou","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0509-3782","authenticated-orcid":false,"given":"Pengfei","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3082-7848","authenticated-orcid":false,"given":"Tong","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Network Engineering, Guangzhou University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9069-8796","authenticated-orcid":false,"given":"Leida","family":"Li","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence and State Key Laboratory of Electromechanical Integrated Manufacturing of High-Performance Electronic Equipments, Xidian University, Xi'an, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.61"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2017.32"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123274"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654979"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2794262"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00698"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27990"},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Learning Representations","volume":"1","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models. International Conference on Learning Representations, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3492259"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680649"},{"key":"e_1_3_2_1_13_1","volume-title":"Aesbench: An expert benchmark for multimodal large language models on image aesthetics perception. arXiv preprint arXiv:2401.08276","author":"Huang Yipo","year":"2024","unstructured":"Yipo Huang, Quan Yuan, Xiangfei Sheng, Zhichao Yang, Haoning Wu, Pengfei Chen, Yuzhe Yang, Leida Li, and Weisi Lin. 2024b. Aesbench: An expert benchmark for multimodal large language models on image aesthetics perception. arXiv preprint arXiv:2401.08276 (2024)."},{"key":"e_1_3_2_1_14_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00248"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.224"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2018.05.018"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02793"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00855"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2914360"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00427"},{"key":"e_1_3_2_1_23_1","series-title":"Lecture Notes in Computer Science","volume-title":"Studying aesthetics in photographic images using a computational approach","author":"Li J","year":"2006","unstructured":"J Li, R Datta, D Joshi, and JZ Wang. 2006. Studying aesthetics in photographic images using a computational approach. Lecture Notes in Computer Science, Vol. 3953 (2006), 288-301."},{"key":"e_1_3_2_1_24_1","volume-title":"Next-vit: Next generation vision transformer for efficient deployment in realistic industrial scenarios. arXiv preprint arXiv:2207.05501","author":"Li Jiashi","year":"2022","unstructured":"Jiashi Li, Xin Xia, Wei Li, Huixia Li, Xing Wang, Xuefeng Xiao, Rui Wang, Min Zheng, and Xin Pan. 2022. Next-vit: Next generation vision transformer for efficient deployment in realistic industrial scenarios. arXiv preprint arXiv:2207.05501 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3470870"},{"key":"e_1_3_2_1_26_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01197"},{"key":"e_1_3_2_1_28_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654927"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCNC.2004.1286964"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247954"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2013.2241042"},{"key":"e_1_3_2_1_33_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00418"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25293"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87361-5_23"},{"key":"e_1_3_2_1_37_1","volume-title":"Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"DetGPT: Detect What You Need via Reasoning. arXiv:2305.14167","author":"Pi Renjie","year":"2023","unstructured":"Renjie Pi, Jiahui Gao, Shizhe Diao, Rui Pan, Hanze Dong, Jipeng Zhang, Lewei Yao, Jianhua Han, Hang Xu, Lingpeng Kong, and Tong Zhang. 2023. DetGPT: Detect What You Need via Reasoning. arXiv:2305.14167 (2023)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/1124772.1124886"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611969"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP49359.2023.10222223"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2831899"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6889"},{"key":"e_1_3_2_1_44_1","volume-title":"Advances and challenges in computational image aesthetics. Human perception of visual information: Psychological and computational perspectives","author":"Valenzise Giuseppe","year":"2021","unstructured":"Giuseppe Valenzise, Chen Kang, and Fr\u00e9d\u00e9ric Dufaux. 2021. Advances and challenges in computational image aesthetics. Human perception of visual information: Psychological and computational perspectives (2021), 133-181."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00969"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00969"},{"key":"e_1_3_2_1_47_1","first-page":"61501","article-title":"Visionllm: Large language model is also an open-ended decoder for vision-centric tasks","volume":"36","author":"Wang Wenhai","year":"2023","unstructured":"Wenhai Wang, Zhe Chen, Xiaokang Chen, Jiannan Wu, Xizhou Zhu, Gang Zeng, Ping Luo, Tong Lu, Jie Zhou, Yu Qiao, et al., 2023a. Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Advances in Neural Information Processing Systems, Vol. 36 (2023), 61501-61513.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.240"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2840724"},{"key":"e_1_3_2_1_50_1","volume-title":"A deep network solution for attention and aesthetics aware photo cropping","author":"Wang Wenguan","year":"2018","unstructured":"Wenguan Wang, Jianbing Shen, and Haibin Ling. 2018b. A deep network solution for attention and aesthetics aware photo cropping. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 7 (2018), 1531-1544."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00570"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00570"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02408"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.130"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680972"},{"key":"e_1_3_2_1_56_1","volume-title":"Language-Guided Visual Perception Disentanglement for Image Quality Assessment and Conditional Image Generation. arXiv preprint arXiv:2503.02206","author":"Yang Zhichao","year":"2025","unstructured":"Zhichao Yang, Leida Li, Pengfei Chen, Jinjian Wu, and Giuseppe Valenzise. 2025. Language-Guided Visual Perception Disentanglement for Image Quality Assessment and Conditional Image Generation. arXiv preprint arXiv:2503.02206 (2025)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2024.104316"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00610"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3024207"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_11"},{"key":"e_1_3_2_1_61_1","volume-title":"Proceedings of the IEEE international conference on multimedia and expo. 4-pp.","author":"Zhang Mingju","year":"2005","unstructured":"Mingju Zhang, Lei Zhang, Yanfeng Sun, Lin Feng, and Weiying Ma. 2005. Auto cropping for digital photographs. In Proceedings of the IEEE international conference on multimedia and expo. 4-pp."},{"key":"e_1_3_2_1_62_1","unstructured":"Zicheng Zhang Ziheng Jia Haoning Wu Chunyi Li Zijian Chen Yingjie Zhou Wei Sun Xiaohong Liu Xiongkuo Min Weisi Lin et al. 2024a. Q-Bench-Video: Benchmarking the Video Quality Understanding of LMMs. arXiv preprint arXiv:2409.20063 (2024)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3445770"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00037"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00037"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754931","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:16:42Z","timestamp":1765340202000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754931"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":65,"alternative-id":["10.1145\/3746027.3754931","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754931","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}