{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:01:20Z","timestamp":1765310480841,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62206318, 62306344"],"award-info":[{"award-number":["62206318, 62306344"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangdong Basic and Applied Basic Research Foundation","award":["2024A1515010253"],"award-info":[{"award-number":["2024A1515010253"]}]},{"name":"Guangzhou Intelligent Educational Technology Collaborative Innovation Center Construction Project","award":["2023B04J0004"],"award-info":[{"award-number":["2023B04J0004"]}]},{"name":"Research Project on Large Model-Based Higher Education Institution Development Data Governance Technology","award":["2025B04J0037"],"award-info":[{"award-number":["2025B04J0037"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755124","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"10945-10954","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Detecting Violations of Physical Common Sense in Images: A Challenge Dataset and Effective Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7262-6219","authenticated-orcid":false,"given":"Weibin","family":"Wu","sequence":"first","affiliation":[{"name":"School of Software Engineering, Zhuhai Key Laboratory of Trusted Large Language Models, Sun Yat-sen University, Zhuhai, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8328-6197","authenticated-orcid":false,"given":"Zitong","family":"Wang","sequence":"additional","affiliation":[{"name":"International School of Business and Finance, Sun Yat-sen University, Zhuhai, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1406-735X","authenticated-orcid":false,"given":"Zhengjie","family":"Luo","sequence":"additional","affiliation":[{"name":"School of Software Engineering, Zhuhai Key Laboratory of Trusted Large Language Models, Sun Yat-sen University, Zhuhai, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8739-2216","authenticated-orcid":false,"given":"Wenqing","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Software Engineering, Zhuhai Key Laboratory of Trusted Large Language Models, Sun Yat-sen University, Zhuhai, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7878-4330","authenticated-orcid":false,"given":"Zibin","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of Software Engineering, Zhuhai Key Laboratory of Trusted Large Language Models, Sun Yat-sen University, Zhuhai, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Aanthropic. 2025. Claude 3.7 Sonnet and Claude Code. https:\/\/www.anthropic.com\/news\/claude-3-7-sonnet"},{"key":"e_1_3_2_2_2_1","volume-title":"Flamingo: A Visual Language Model for Few-Shot Learning. In Conference on Neural Information Processing Systems. 23716-23736","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, et al., 2022. Flamingo: A Visual Language Model for Few-Shot Learning. In Conference on Neural Information Processing Systems. 23716-23736."},{"key":"e_1_3_2_2_3_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_2_4_1","volume-title":"Amir Hosein Khasahmadi, and Rahul G Krishnan","author":"Balazadeh Vahid","year":"2024","unstructured":"Vahid Balazadeh, Mohammadmehdi Ataei, Hyunmin Cheong, Amir Hosein Khasahmadi, and Rahul G Krishnan. 2024. Synthetic Vision: Training Vision-Language Models to Understand Physics. arXiv preprint arXiv:2412.08619 (2024)."},{"key":"e_1_3_2_2_5_1","first-page":"1","article-title":"Physics-Informed Computer Vision","volume":"57","author":"Banerjee Chayan","year":"2024","unstructured":"Chayan Banerjee, Kien Nguyen, Clinton Fookes, and Karniadakis George. 2024. Physics-Informed Computer Vision: A Review and Perspectives. Comput. Surveys, Vol. 57, 1 (2024), 1-38.","journal-title":"A Review and Perspectives. Comput. Surveys"},{"key":"e_1_3_2_2_6_1","volume-title":"VideoPhy: Evaluating Physical Commonsense for Video Generation. In International Conference on Learning Representations.","author":"Bansal Hritik","year":"2025","unstructured":"Hritik Bansal, Zongyu Lin, Tianyi Xie, Zeshun Zong, Michal Yarom, Yonatan Bitton, Chenfanfu Jiang, Yizhou Sun, Kai-Wei Chang, and Aditya Grover. 2025. VideoPhy: Evaluating Physical Commonsense for Video Generation. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_7_1","unstructured":"Xiao Bi Deli Chen Guanting Chen Shanhuang Chen Damai Dai Chengqi Deng Honghui Ding Kai Dong Qiushi Du Zhe Fu et al. 2024. DeepSeek LLM: Scaling Open-Source Language Models With Longtermism. arXiv preprint arXiv:2401.02954 (2024)."},{"key":"e_1_3_2_2_8_1","volume-title":"Language Models Are Few-Shot Learners. In Conference on Neural Information Processing Systems. 1877-1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al., 2020. Language Models Are Few-Shot Learners. In Conference on Neural Information Processing Systems. 1877-1901."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10409-021-01148-1"},{"key":"e_1_3_2_2_10_1","volume-title":"REVEAL: Relation-Based Video Representation Learning for Video-Question-Answering. arXiv preprint arXiv:2504.05463","author":"Chaybouti Sofian","year":"2025","unstructured":"Sofian Chaybouti, Walid Bousselham, Moritz Wolter, and Hilde Kuehne. 2025. REVEAL: Relation-Based Video Representation Learning for Video-Question-Answering. arXiv preprint arXiv:2504.05463 (2025)."},{"key":"e_1_3_2_2_11_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024a. Expanding Performance Boundaries of Open-Source Multimodal Models With Model Data and Test-Time Scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_2_13_1","volume-title":"PhysBench: Benchmarking and Enhancing Vision-Language Models for Physical World Understanding. In International Conference on Learning Representations.","author":"Chow Wei","year":"2025","unstructured":"Wei Chow, Jiageng Mao, Boyi Li, Daniel Seita, Vitor Campagnolo Guizilini, and Yue Wang. 2025. PhysBench: Benchmarking and Enhancing Vision-Language Models for Physical World Understanding. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_14_1","first-page":"1","article-title":". Scaling Instruction-Finetuned Language Models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al., 2024. Scaling Instruction-Finetuned Language Models. Journal of Machine Learning Research, Vol. 25, 70 (2024), 1-53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_15_1","volume-title":"On the Robustness of Large Multimodal Models Against Image Adversarial Attacks. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24625-24634","author":"Cui Xuanming","year":"2024","unstructured":"Xuanming Cui, Alejandro Aparcedo, Young Kyun Jang, and Ser-Nam Lim. 2024. On the Robustness of Large Multimodal Models Against Image Adversarial Attacks. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24625-24634."},{"key":"e_1_3_2_2_16_1","volume-title":"Blurred-Dilated Method for Adversarial Attacks. In Conference on Neural Information Processing Systems. 58613-58624","author":"Deng Yang","year":"2023","unstructured":"Yang Deng, Weibin Wu, Jianping Zhang, and Zibin Zheng. 2023. Blurred-Dilated Method for Adversarial Attacks. In Conference on Neural Information Processing Systems. 58613-58624."},{"key":"e_1_3_2_2_17_1","volume-title":"Words: Transformers for Image Recognition at Scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al., 2020. An Image Is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_2_18_1","volume-title":"Scene-LLM: Extending Language Model for 3D Visual Understanding and Reasoning. arXiv preprint arXiv:2403.11401","author":"Fu Rao","year":"2024","unstructured":"Rao Fu, Jingyu Liu, Xilun Chen, Yixin Nie, and Wenhan Xiong. 2024. Scene-LLM: Extending Language Model for 3D Visual Understanding and Reasoning. arXiv preprint arXiv:2403.11401 (2024)."},{"key":"e_1_3_2_2_19_1","volume-title":"Improving Dynamic Object Interactions in Text-To-Video Generation With AI Feedback. arXiv preprint arXiv:2412.02617","author":"Furuta Hiroki","year":"2024","unstructured":"Hiroki Furuta, Heiga Zen, Dale Schuurmans, Aleksandra Faust, Yutaka Matsuo, Percy Liang, and Sherry Yang. 2024. Improving Dynamic Object Interactions in Text-To-Video Generation With AI Feedback. arXiv preprint arXiv:2412.02617 (2024)."},{"key":"e_1_3_2_2_20_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_2_21_1","volume-title":"Efficient Multimodal Learning From Data-Centric Perspective. arXiv preprint arXiv:2402.11530","author":"He Muyang","year":"2024","unstructured":"Muyang He, Yexin Liu, Boya Wu, Jianhao Yuan, Yueze Wang, Tiejun Huang, and Bo Zhao. 2024. Efficient Multimodal Learning From Data-Centric Perspective. arXiv preprint arXiv:2402.11530 (2024)."},{"key":"e_1_3_2_2_22_1","volume-title":"AutoVFX: Physically Realistic Video Editing from Natural Language Instructions. arXiv preprint arXiv:2411.02394","author":"Hsu Hao-Yu","year":"2024","unstructured":"Hao-Yu Hsu, Zhi-Hao Lin, Albert J Zhai, Hongchi Xia, and Shenlong Wang. 2024. AutoVFX: Physically Realistic Video Editing from Natural Language Instructions. arXiv preprint arXiv:2411.02394 (2024)."},{"key":"e_1_3_2_2_23_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_24_1","first-page":"2256","volume-title":"BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual Questions. In AAAI Conference on Artificial Intelligencee","volume":"38","author":"Hu Wenbo","year":"2024","unstructured":"Wenbo Hu, Yifan Xu, Yi Li, Weiyue Li, Zeyuan Chen, and Zhuowen Tu. 2024. BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual Questions. In AAAI Conference on Artificial Intelligencee, Vol. 38. 2256-2264."},{"key":"e_1_3_2_2_25_1","volume-title":"Language Is Not All You Need: Aligning Perception With Language Models. In Conference on Neural Information Processing Systems. 72096-72109","author":"Huang Shaohan","year":"2023","unstructured":"Shaohan Huang, Li Dong, Wenhui Wang, Yaru Hao, Saksham Singhal, Shuming Ma, Tengchao Lv, Lei Cui, Owais Khan Mohammed, Barun Patra, et al., 2023. Language Is Not All You Need: Aligning Perception With Language Models. In Conference on Neural Information Processing Systems. 72096-72109."},{"key":"e_1_3_2_2_26_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. GPT-4o System Card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_2_27_1","volume-title":"VCoder: Versatile Vision Encoders for Multimodal Large Language Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 27992-28002","author":"Jain Jitesh","year":"2024","unstructured":"Jitesh Jain, Jianwei Yang, and Humphrey Shi. 2024. VCoder: Versatile Vision Encoders for Multimodal Large Language Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 27992-28002."},{"key":"e_1_3_2_2_28_1","unstructured":"Koray Kavukcuoglu. 2025. Gemini 2.0 Is Now Available to Everyone. https:\/\/blog.google\/technology\/google-deepmind\/gemini-model-updates-february-2025\/"},{"key":"e_1_3_2_2_29_1","volume-title":"International Conference on Machine Learning. 19730-19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-Training With Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning. 19730-19742."},{"key":"e_1_3_2_2_30_1","volume-title":"Improved Baselines With Visual Instruction Tuning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 26296-26306","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2024a. Improved Baselines With Visual Instruction Tuning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 26296-26306."},{"key":"e_1_3_2_2_31_1","volume-title":"Visual Instruction Tuning. In Conference on Neural Information Processing Systems. 34892-34916","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In Conference on Neural Information Processing Systems. 34892-34916."},{"key":"e_1_3_2_2_32_1","volume-title":"PhysGen: Rigid-Body Physics-Grounded Image-To-Video Generation. In European Conference on Computer Vision. 360-378","author":"Liu Shaowei","year":"2024","unstructured":"Shaowei Liu, Zhongzheng Ren, Saurabh Gupta, and Shenlong Wang. 2024b. PhysGen: Rigid-Body Physics-Grounded Image-To-Video Generation. In European Conference on Computer Vision. 360-378."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/VRW66409.2025.00263"},{"key":"e_1_3_2_2_34_1","volume-title":"Towards World Simulator: Crafting Physical Commonsense-Based Benchmark for Video Generation. In International Conference on Machine Learning.","author":"Meng Fanqing","year":"2025","unstructured":"Fanqing Meng, Jiaqi Liao, Xinyu Tan, Quanfeng Lu, Wenqi Shao, Kaipeng Zhang, Yu Cheng, Dianqi Li, and Ping Luo. 2025. Towards World Simulator: Crafting Physical Commonsense-Based Benchmark for Video Generation. In International Conference on Machine Learning."},{"key":"e_1_3_2_2_35_1","unstructured":"Fanqing Meng Wenqi Shao Lixin Luo Yahong Wang Yiran Chen Quanfeng Lu Yue Yang Tianshuo Yang Kaipeng Zhang Yu Qiao et al. 2024. PhyBench: A Physical Commonsense Benchmark for Evaluating Text-To-Image Models. arXiv preprint arXiv:2406.11802 (2024)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3425448"},{"key":"e_1_3_2_2_37_1","volume-title":"Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_2_38_1","volume-title":"Feature Splatting: Language-Driven Physics-Based Scene Synthesis and Editing. arXiv preprint arXiv:2404.01223","author":"Qiu Ri-Zhao","year":"2024","unstructured":"Ri-Zhao Qiu, Ge Yang, Weijia Zeng, and Xiaolong Wang. 2024. Feature Splatting: Language-Driven Physics-Based Scene Synthesis and Editing. arXiv preprint arXiv:2404.01223 (2024)."},{"key":"e_1_3_2_2_39_1","volume-title":"arXiv preprint arXiv:2412.15115","author":"Yang An","year":"2025","unstructured":"Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tianyi Tang, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. 2025. Qwen2.5 Technical Report. arXiv preprint arXiv:2412.15115 (2025)."},{"key":"e_1_3_2_2_40_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. 8748-8763."},{"key":"e_1_3_2_2_41_1","volume-title":"Mathieu Bernard, Adam Lerer, Rob Fergus, V\u00e9ronique Izard, and Emmanuel Dupoux.","author":"Riochet Ronan","year":"2018","unstructured":"Ronan Riochet, Mario Ynocente Castro, Mathieu Bernard, Adam Lerer, Rob Fergus, V\u00e9ronique Izard, and Emmanuel Dupoux. 2018. Intphys: A Framework and Benchmark for Visual Intuitive Physics Reasoning. arXiv preprint arXiv:1803.07616 (2018)."},{"key":"e_1_3_2_2_42_1","volume-title":"High-Resolution Image Synthesis With Latent Diffusion Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 10684-10695","author":"Rombach Robin","year":"2022","unstructured":"Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis With Latent Diffusion Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 10684-10695."},{"key":"e_1_3_2_2_43_1","volume-title":"Probing Conceptual Understanding of Large Visual-Language Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1797-1807","author":"Schiappa Madeline","year":"2024","unstructured":"Madeline Schiappa, Raiyaan Abdullah, Shehreen Azad, Jared Claypoole, Michael Cogswell, Ajay Divakaran, and Yogesh Rawat. 2024. Probing Conceptual Understanding of Large Visual-Language Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1797-1807."},{"key":"e_1_3_2_2_44_1","unstructured":"Zhihong Shao Peiyi Wang Qihao Zhu Runxin Xu Junxiao Song Xiao Bi Haowei Zhang Mingchuan Zhang YK Li Yang Wu et al. 2024. DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. arXiv preprint arXiv:2402.03300 (2024)."},{"key":"e_1_3_2_2_45_1","volume-title":"Juliette Love, et al.","author":"Team Gemma","year":"2024","unstructured":"Gemma Team, Thomas Mesnard, Cassidy Hardin, Robert Dadashi, Surya Bhupatiraju, Shreya Pathak, Laurent Sifre, Morgane Rivi\u00e8re, Mihir Sanjay Kale, Juliette Love, et al., 2024. Gemma: Open Models Based on Gemini Research and Technology. arXiv preprint arXiv:2403.08295 (2024)."},{"key":"e_1_3_2_2_46_1","volume-title":"Vision-Centric Exploration of Multimodal LLMs. In Conference on Neural Information Processing Systems. 87310-87356","author":"Tong Peter","year":"2024","unstructured":"Peter Tong, Ellis Brown, Penghao Wu, Sanghyun Woo, Adithya Jairam Vedagiri IYER, Sai Charitha Akula, Shusheng Yang, Jihan Yang, Manoj Middepogu, Ziteng Wang, et al., 2024. Cambrian-1: A Fully Open, Vision-Centric Exploration of Multimodal LLMs. In Conference on Neural Information Processing Systems. 87310-87356."},{"key":"e_1_3_2_2_47_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et al. 2023a. LLaMA: Open and Efficient Foundation Language Models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_2_48_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023b. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_2_49_1","volume-title":"Fine-Grained Detecting and Calibrating Abnormal Human-Body. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 21226-21237","author":"Wang Zeqing","year":"2025","unstructured":"Zeqing Wang, Qingyang Ma, Wentao Wan, Haojie Li, Keze Wang, and Yonghong Tian. 2025. Is This Generated Person Existed in Real-World? Fine-Grained Detecting and Calibrating Abnormal Human-Body. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 21226-21237."},{"key":"e_1_3_2_2_50_1","volume-title":"Improving Transferable Targeted Adversarial Attacks with Model Self-Enhancement. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24615-24624","author":"Wu Han","year":"2024","unstructured":"Han Wu, Guanyan Ou, Weibin Wu, and Zibin Zheng. 2024. Improving Transferable Targeted Adversarial Attacks with Model Self-Enhancement. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24615-24624."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3715784"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3729397"},{"key":"e_1_3_2_2_53_1","volume-title":"Practical and Efficient Model Extraction of Sentiment Analysis APIs. In International Conference on Software Engineering. 524-536","author":"Wu Weibin","year":"2023","unstructured":"Weibin Wu, Jianping Zhang, Victor Junqiu Wei, Xixian Chen, Zibin Zheng, Irwin King, and Michael R Lyu. 2023. Practical and Efficient Model Extraction of Sentiment Analysis APIs. In International Conference on Software Engineering. 524-536."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00438"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00420"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_2_58_1","first-page":"3881","article-title":"Sentiment Analysis in the Era of Large Language Models","volume":"2024","author":"Zhang Wenxuan","year":"2024","unstructured":"Wenxuan Zhang, Yue Deng, Bing Liu, Sinno Pan, and Lidong Bing. 2024a. Sentiment Analysis in the Era of Large Language Models: A Reality Check. In Findings of the Association for Computational Linguistics: NAACL 2024. 3881-3906.","journal-title":"A Reality Check. In Findings of the Association for Computational Linguistics: NAACL"},{"key":"e_1_3_2_2_59_1","volume-title":"MiniGPT-5: Interleaved Vision-And-Language Generation via Generative Vokens. arXiv preprint arXiv:2310.02239","author":"Zheng Kaizhi","year":"2023","unstructured":"Kaizhi Zheng, Xuehai He, and Xin Eric Wang. 2023. MiniGPT-5: Interleaved Vision-And-Language Generation via Generative Vokens. arXiv preprint arXiv:2310.02239 (2023)."},{"key":"e_1_3_2_2_60_1","volume-title":"ViCor: Bridging Visual Understanding and Commonsense Reasoning With Large Language Models. arXiv preprint arXiv:2310.05872","author":"Zhou Kaiwen","year":"2023","unstructured":"Kaiwen Zhou, Kwonjoon Lee, Teruhisa Misu, and Xin Eric Wang. 2023. ViCor: Bridging Visual Understanding and Commonsense Reasoning With Large Language Models. arXiv preprint arXiv:2310.05872 (2023)."},{"key":"e_1_3_2_2_61_1","volume-title":"Conditional Prompt Learning for Vision-Language Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16816-16825","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. 2022. Conditional Prompt Learning for Vision-Language Models. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16816-16825."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755124","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:59:52Z","timestamp":1765310392000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755124"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":61,"alternative-id":["10.1145\/3746027.3755124","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755124","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}