{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:20:25Z","timestamp":1765308025682,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":82,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024QY1400"],"award-info":[{"award-number":["2024QY1400"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62425604"],"award-info":[{"award-number":["62425604"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758156","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:44:48Z","timestamp":1761371088000},"page":"12304-12313","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Specify Privacy Yourself: Assessing Inference-Time Personalized Privacy Preservation Ability of Large Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-0017-6415","authenticated-orcid":false,"given":"Xingqi","family":"Wang","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China and Key Laboratory of Pervasive Computing, Ministry of Education, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2710-1613","authenticated-orcid":false,"given":"Xiaoyuan","family":"Yi","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8608-8482","authenticated-orcid":false,"given":"Xing","family":"Xie","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8449-278X","authenticated-orcid":false,"given":"Jia","family":"Jia","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, BNRist, Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","unstructured":"Anthropic. 2024. The Claude 3 Model Family: Opus Sonnet Haiku. https:\/\/assets.anthropic.com\/m\/61e7d27f8c8f5919\/original\/Claude-3-Model-Card.pdf"},{"key":"e_1_3_2_2_3_1","unstructured":"Amanda Askell Yuntao Bai Anna Chen Dawn Drain Deep Ganguli Tom Henighan Andy Jones Nicholas Joseph Ben Mann Nova DasSarma et al. 2021. A general language assistant as a laboratory for alignment. arXiv preprint arXiv:2112.00861 (2021)."},{"key":"e_1_3_2_2_4_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_2_5_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_2_6_1","unstructured":"Yuntao Bai Saurav Kadavath Sandipan Kundu Amanda Askell Jackson Kernion Andy Jones Anna Chen Anna Goldie Azalia Mirhoseini Cameron McKinnon et al. 2022. Constitutional ai: Harmlessness from ai feedback. arXiv preprint arXiv:2212.08073 (2022)."},{"key":"e_1_3_2_2_7_1","volume-title":"Special characters attack: Toward scalable training data extraction from large language models. arXiv preprint arXiv:2405.05990","author":"Bai Yang","year":"2024","unstructured":"Yang Bai, Ge Pei, Jindong Gu, Yong Yang, and Xingjun Ma. 2024. Special characters attack: Toward scalable training data extraction from large language models. arXiv preprint arXiv:2405.05990 (2024)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.2307\/2334029"},{"key":"e_1_3_2_2_9_1","volume-title":"The phantom menace: unmasking privacy leakages in vision-language models. arXiv preprint arXiv:2408.01228","author":"Caldarella Simone","year":"2024","unstructured":"Simone Caldarella, Massimiliano Mancini, Elisa Ricci, and Rahaf Aljundi. 2024. The phantom menace: unmasking privacy leakages in vision-language models. arXiv preprint arXiv:2408.01228 (2024)."},{"key":"e_1_3_2_2_10_1","unstructured":"Nicholas Carlini Florian Tramer Eric Wallace Matthew Jagielski Ariel Herbert-Voss Katherine Lee Adam Roberts Tom Brown Dawn Song Ulfar Erlingsson et al. 2021. Extracting training data from large language models. In 30th USENIX security symposium (USENIX Security 21). 2633-2650."},{"key":"e_1_3_2_2_11_1","volume-title":"Noise contrastive alignment of language models with explicit rewards. arXiv preprint arXiv:2402.05369","author":"Chen Huayu","year":"2024","unstructured":"Huayu Chen, Guande He, Lifan Yuan, Ganqu Cui, Hang Su, and Jun Zhu. 2024a. Noise contrastive alignment of language models with explicit rewards. arXiv preprint arXiv:2402.05369 (2024)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-024-01276-1"},{"key":"e_1_3_2_2_13_1","first-page":"70115","article-title":"Large language models are visual reasoning coordinators","volume":"36","author":"Chen Liangyu","year":"2023","unstructured":"Liangyu Chen, Bo Li, Sheng Shen, Jingkang Yang, Chunyuan Li, Kurt Keutzer, Trevor Darrell, and Ziwei Liu. 2023a. Large language models are visual reasoning coordinators. Advances in Neural Information Processing Systems, Vol. 36 (2023), 70115-70140.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_14_1","volume-title":"Janus-pro: Unified multimodal understanding and generation with data and model scaling. arXiv preprint arXiv:2501.17811","author":"Chen Xiaokang","year":"2025","unstructured":"Xiaokang Chen, Zhiyu Wu, Xingchao Liu, Zizheng Pan, Wen Liu, Zhenda Xie, Xingkai Yu, and Chong Ruan. 2025. Janus-pro: Unified multimodal understanding and generation with data and model scaling. arXiv preprint arXiv:2501.17811 (2025)."},{"key":"e_1_3_2_2_15_1","volume-title":"Can language models be instructed to protect personal information? arXiv preprint arXiv:2310.02224","author":"Chen Yang","year":"2023","unstructured":"Yang Chen, Ethan Mendes, Sauvik Das, Wei Xu, and Alan Ritter. 2023b. Can language models be instructed to protect personal information? arXiv preprint arXiv:2310.02224 (2023)."},{"key":"e_1_3_2_2_16_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024c. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_2_17_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024d. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185-24198."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3712001"},{"key":"e_1_3_2_2_19_1","volume-title":"Measuring nominal scale agreement among many raters. Psychological bulletin","author":"Fleiss Joseph L","year":"1971","unstructured":"Joseph L Fleiss. 1971. Measuring nominal scale agreement among many raters. Psychological bulletin, Vol. 76, 5 (1971), 378."},{"key":"e_1_3_2_2_20_1","volume-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Jinrui Yang, Xiawu Zheng, Ke Li, Xing Sun, et al., 2023. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394 (2023)."},{"key":"e_1_3_2_2_21_1","volume-title":"Amir Zamir, and Maria Brbic.","author":"Gadetsky Artyom","year":"2025","unstructured":"Artyom Gadetsky, Andrei Atanov, Yulun Jiang, Zhitong Gao, Ghazal Hosseini Mighan, Amir Zamir, and Maria Brbic. 2025. Large (Vision) Language Models are Unsupervised In-Context Learners. arXiv preprint arXiv:2504.02349 (2025)."},{"key":"e_1_3_2_2_22_1","volume-title":"Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793","author":"Aohan Zeng Team GLM","year":"2024","unstructured":"Team GLM, Aohan Zeng, Bin Xu, Bowen Wang, Chenhui Zhang, Da Yin, Dan Zhang, Diego Rojas, Guanyu Feng, Hanlin Zhao, et al., 2024. Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv preprint arXiv:2406.12793 (2024)."},{"key":"e_1_3_2_2_23_1","unstructured":"Google. 2024. Introducing Gemini 2.0: our new AI model for the agentic era. https:\/\/blog.google\/technology\/google-deepmind\/google-gemini-ai-update-december-2024\/"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0232"},{"key":"e_1_3_2_2_25_1","volume-title":"A Survey on Personalized Alignment-The Missing Piece for Large Language Models in Real-World Applications. arXiv preprint arXiv:2503.17003","author":"Guan Jian","year":"2025","unstructured":"Jian Guan, Junfei Wu, Jia-Nan Li, Chuanqi Cheng, and Wei Wu. 2025. A Survey on Personalized Alignment-The Missing Piece for Large Language Models in Real-World Applications. arXiv preprint arXiv:2503.17003 (2025)."},{"key":"e_1_3_2_2_26_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_2_27_1","volume-title":"Membership Inference Attacks Against Vision-Language Models. arXiv preprint arXiv:2501.18624","author":"Hu Yuke","year":"2025","unstructured":"Yuke Hu, Zheng Li, Zhihao Liu, Yang Zhang, Zhan Qin, Kui Ren, and Chun Chen. 2025. Membership Inference Attacks Against Vision-Language Models. arXiv preprint arXiv:2501.18624 (2025)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.3390\/e19120656"},{"key":"e_1_3_2_2_29_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_2_30_1","volume-title":"Yizhong Wang, Jack Hessel, Luke Zettlemoyer, Hannaneh Hajishirzi, Yejin Choi, and Prithviraj Ammanabrolu.","author":"Jang Joel","year":"2023","unstructured":"Joel Jang, Seungone Kim, Bill Yuchen Lin, Yizhong Wang, Jack Hessel, Luke Zettlemoyer, Hannaneh Hajishirzi, Yejin Choi, and Prithviraj Ammanabrolu. 2023. Personalized soups: Personalized large language model alignment via post-hoc parameter merging. arXiv preprint arXiv:2310.11564 (2023)."},{"key":"e_1_3_2_2_31_1","volume-title":"Modeling privacy control in context-aware systems","author":"Jiang Xiaodong","year":"2002","unstructured":"Xiaodong Jiang and James A Landay. 2002. Modeling privacy control in context-aware systems. IEEE Pervasive computing, Vol. 1, 3 (2002), 59-63."},{"key":"e_1_3_2_2_32_1","volume-title":"Personalisation within bounds: A risk taxonomy and policy framework for the alignment of large language models with personalised feedback. arXiv preprint arXiv:2303.05453","author":"Kirk Hannah Rose","year":"2023","unstructured":"Hannah Rose Kirk, Bertie Vidgen, Paul R\u00f6ttger, and Scott A Hale. 2023. Personalisation within bounds: A risk taxonomy and policy framework for the alignment of large language models with personalised feedback. arXiv preprint arXiv:2303.05453 (2023)."},{"key":"e_1_3_2_2_33_1","first-page":"73783","article-title":"Aligning to thousands of preferences via system message generalization","volume":"37","author":"Lee Seongyun","year":"2024","unstructured":"Seongyun Lee, Sue Hyun Park, Seungone Kim, and Minjoon Seo. 2024. Aligning to thousands of preferences via system message generalization. Advances in Neural Information Processing Systems, Vol. 37 (2024), 73783-73829.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645408"},{"key":"e_1_3_2_2_35_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_2_36_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_2_37_1","volume-title":"Scaling up personalized preference for user-level alignment. arXiv preprint arXiv:2503.15463","author":"Li Jia-Nan","year":"2025","unstructured":"Jia-Nan Li, Jian Guan, Songhao Wu, Wei Wu, and Rui Yan. 2025. From 1,000,000 users to every user: Scaling up personalized preference for user-level alignment. arXiv preprint arXiv:2503.15463 (2025)."},{"key":"e_1_3_2_2_38_1","volume-title":"Large language models can be strong differentially private learners. arXiv preprint arXiv:2110.05679","author":"Li Xuechen","year":"2021","unstructured":"Xuechen Li, Florian Tramer, Percy Liang, and Tatsunori Hashimoto. 2021. Large language models can be strong differentially private learners. arXiv preprint arXiv:2110.05679 (2021)."},{"key":"e_1_3_2_2_39_1","volume-title":"Personalized language modeling from personalized human feedback. arXiv preprint arXiv:2402.05133","author":"Li Xinyu","year":"2024","unstructured":"Xinyu Li, Ruiyang Zhou, Zachary C Lipton, and Liu Leqi. 2024c. Personalized language modeling from personalized human feedback. arXiv preprint arXiv:2402.05133 (2024)."},{"key":"e_1_3_2_2_40_1","first-page":"98645","article-title":"Membership inference attacks against large vision-language models","volume":"37","author":"Li Zhan","year":"2024","unstructured":"Zhan Li, Yongtao Wu, Yihang Chen, Francesco Tonin, Elias Abad Rocamora, and Volkan Cevher. 2024a. Membership inference attacks against large vision-language models. Advances in Neural Information Processing Systems, Vol. 37 (2024), 98645-98674.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_41_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4235-6"},{"key":"e_1_3_2_2_43_1","volume-title":"Protecting privacy in multimodal large language models with mllmu-bench. arXiv preprint arXiv:2410.22108","author":"Liu Zheyuan","year":"2024","unstructured":"Zheyuan Liu, Guangyao Dou, Mengzhao Jia, Zhaoxuan Tan, Qingkai Zeng, Yongle Yuan, and Meng Jiang. 2024a. Protecting privacy in multimodal large language models with mllmu-bench. arXiv preprint arXiv:2410.22108 (2024)."},{"key":"e_1_3_2_2_44_1","volume-title":"Safety alignment for vision language models. arXiv preprint arXiv:2405.13581","author":"Liu Zhendong","year":"2024","unstructured":"Zhendong Liu, Yuanbi Nie, Yingshui Tan, Xiangyu Yue, Qiushi Cui, Chongjun Wang, Xiaoyong Zhu, and Bo Zheng. 2024c. Safety alignment for vision language models. arXiv preprint arXiv:2405.13581 (2024)."},{"key":"e_1_3_2_2_45_1","volume-title":"Doxing via the Lens: Revealing Privacy Leakage in Image Geolocation for Agentic Multi-Modal Large Reasoning Model. arXiv preprint arXiv:2504.19373","author":"Luo Weidi","year":"2025","unstructured":"Weidi Luo, Qiming Zhang, Tianyu Lu, Xiaogeng Liu, Yue Zhao, Zhen Xiang, and Chaowei Xiao. 2025. Doxing via the Lens: Revealing Privacy Leakage in Image Geolocation for Agentic Multi-Modal Large Reasoning Model. arXiv preprint arXiv:2504.19373 (2025)."},{"key":"e_1_3_2_2_46_1","volume-title":"OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge. In Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Marino Kenneth","year":"2019","unstructured":"Kenneth Marino, Mohammad Rastegari, Ali Farhadi, and Roozbeh Mottaghi. 2019. OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge. In Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_47_1","volume-title":"Granular privacy control for geolocation with vision language models. arXiv preprint arXiv:2407.04952","author":"Mendes Ethan","year":"2024","unstructured":"Ethan Mendes, Yang Chen, James Hays, Sauvik Das, Wei Xu, and Alan Ritter. 2024. Granular privacy control for geolocation with vision language models. arXiv preprint arXiv:2407.04952 (2024)."},{"key":"e_1_3_2_2_48_1","unstructured":"Meta. 2024. Llama 3.2: Revolutionizing edge AI and vision with open customizable models. https:\/\/ai.meta.com\/blog\/llama-3-2-connect-2024-vision-edge-mobile-devices\/"},{"key":"e_1_3_2_2_49_1","volume-title":"The llama 4 herd: The beginning of a new era of natively multimodal ai innovation. https:\/\/ai. meta.com\/blog\/llama-4-multimodal-intelligence\/, checked on","author":"Meta AI","year":"2025","unstructured":"AI Meta. 2025. The llama 4 herd: The beginning of a new era of natively multimodal ai innovation. https:\/\/ai. meta.com\/blog\/llama-4-multimodal-intelligence\/, checked on, Vol. 4, 7 (2025), 2025."},{"key":"e_1_3_2_2_50_1","volume-title":"Privacy issues in large language models: A survey. arXiv preprint arXiv:2312.06717","author":"Neel Seth","year":"2023","unstructured":"Seth Neel and Peter Chang. 2023. Privacy issues in large language models: A survey. arXiv preprint arXiv:2312.06717 (2023)."},{"key":"e_1_3_2_2_51_1","unstructured":"OpenAI. 2025. OpenAI o3 and o4-mini System Card. https:\/\/cdn.openai.com\/pdf\/2221c875-02dc-4789-800b-e7758f3722c1\/o3-and-o4-mini-system-card.pdf"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.398"},{"key":"e_1_3_2_2_53_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems Vol. 35 (2022) 27730-27744."},{"key":"e_1_3_2_2_54_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_2_55_1","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov Rafael","year":"2023","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. 2023. Direct preference optimization: Your language model is secretly a reward model. Advances in Neural Information Processing Systems, Vol. 36 (2023), 53728-53741.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_2_57_1","volume-title":"Privacy-aware visual language models. arXiv preprint arXiv:2405.17423","author":"Samson Laurens","year":"2024","unstructured":"Laurens Samson, Nimrod Barazani, Sennay Ghebreab, and Yuki M Asano. 2024. Privacy-aware visual language models. arXiv preprint arXiv:2405.17423 (2024)."},{"key":"e_1_3_2_2_58_1","volume-title":"Distributional preference learning: Understanding and accounting for hidden context in rlhf. arXiv preprint arXiv:2312.08358","author":"Siththaranjan Anand","year":"2023","unstructured":"Anand Siththaranjan, Cassidy Laidlaw, and Dylan Hadfield-Menell. 2023. Distributional preference learning: Understanding and accounting for hidden context in rlhf. arXiv preprint arXiv:2312.08358 (2023)."},{"key":"e_1_3_2_2_59_1","volume-title":"Carolyn Ashurst, and Adrian Weller.","author":"Smith Victoria","year":"2023","unstructured":"Victoria Smith, Ali Shahin Shamsabadi, Carolyn Ashurst, and Adrian Weller. 2023. Identifying and mitigating privacy risks stemming from language models: A survey. arXiv preprint arXiv:2310.01424 (2023)."},{"key":"e_1_3_2_2_60_1","volume-title":"Beyond memorization: Violating privacy via inference with large language models. arXiv preprint arXiv:2310.07298","author":"Staab Robin","year":"2023","unstructured":"Robin Staab, Mark Vero, Mislav Balunovi\u0107, and Martin Vechev. 2023. Beyond memorization: Violating privacy via inference with large language models. arXiv preprint arXiv:2310.07298 (2023)."},{"key":"e_1_3_2_2_61_1","volume-title":"Personalized pieces: Efficient personalized large language models through collaborative efforts. arXiv preprint arXiv:2406.10471","author":"Tan Zhaoxuan","year":"2024","unstructured":"Zhaoxuan Tan, Zheyuan Liu, and Meng Jiang. 2024. Personalized pieces: Efficient personalized large language models through collaborative efforts. arXiv preprint arXiv:2406.10471 (2024)."},{"key":"e_1_3_2_2_62_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_2_63_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, Garrett Tanzer, Damien Vincent, Zhufeng Pan, Shibo Wang, et al., 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)."},{"key":"e_1_3_2_2_64_1","unstructured":"Qwen Team. 2024. Qwen2-VL: To See the World More Clearly. https:\/\/qwenlm.github.io\/blog\/qwen2-vl\/"},{"key":"e_1_3_2_2_65_1","volume-title":"Private Attribute Inference from Images with Vision-Language Models. arXiv preprint arXiv:2404.10618","author":"T\u00f6mek\u00e7e Batuhan","year":"2024","unstructured":"Batuhan T\u00f6mek\u00e7e, Mark Vero, Robin Staab, and Martin Vechev. 2024. Private Attribute Inference from Images with Vision-Language Models. arXiv preprint arXiv:2404.10618 (2024)."},{"key":"e_1_3_2_2_66_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_2_67_1","first-page":"121475","article-title":"Cogvlm: Visual expert for pretrained language models","volume":"37","author":"Wang Weihan","year":"2024","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Song XiXuan, et al., 2024b. Cogvlm: Visual expert for pretrained language models. Advances in Neural Information Processing Systems, Vol. 37 (2024), 121475-121499.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_68_1","volume-title":"Aligning large language models with human: A survey. arXiv preprint arXiv:2307.12966","author":"Wang Yufei","year":"2023","unstructured":"Yufei Wang, Wanjun Zhong, Liangyou Li, Fei Mi, Xingshan Zeng, Wenyong Huang, Lifeng Shang, Xin Jiang, and Qun Liu. 2023. Aligning large language models with human: A survey. arXiv preprint arXiv:2307.12966 (2023)."},{"key":"e_1_3_2_2_69_1","unstructured":"Simon Willison. 2025. Watching o3 guess a photo's location is surreal dystopian and wildly entertaining. https:\/\/simonwillison.net\/2025\/Apr\/26\/o3-photo-locations\/"},{"key":"e_1_3_2_2_70_1","unstructured":"Zhiyu Wu Xiaokang Chen Zizheng Pan Xingchao Liu Wen Liu Damai Dai Huazuo Gao Yiyang Ma Chengyue Wu Bingxuan Wang et al. 2024. Deepseek-vl2: Mixture-of-experts vision-language models for advanced multimodal understanding. arXiv preprint arXiv:2412.10302 (2024)."},{"key":"e_1_3_2_2_71_1","volume-title":"On protecting the data privacy of large language models (llms): A survey. arXiv preprint arXiv:2403.05156","author":"Yan Biwei","year":"2024","unstructured":"Biwei Yan, Kun Li, Minghui Xu, Yueyan Dong, Yue Zhang, Zhaochun Ren, and Xiuzhen Cheng. 2024. On protecting the data privacy of large language models (llms): A survey. arXiv preprint arXiv:2403.05156 (2024)."},{"key":"e_1_3_2_2_72_1","volume-title":"No Preference Left Behind: Group Distributional Preference Optimization. arXiv preprint arXiv:2412.20299","author":"Yao Binwei","year":"2024","unstructured":"Binwei Yao, Zefan Cai, Yun-Shiuan Chuang, Shanglin Yang, Ming Jiang, Diyi Yang, and Junjie Hu. 2024. No Preference Left Behind: Group Distributional Preference Optimization. arXiv preprint arXiv:2412.20299 (2024)."},{"key":"e_1_3_2_2_73_1","volume-title":"J Pablo Mu noz, and Ali Jannesari","author":"Yu Sixing","year":"2023","unstructured":"Sixing Yu, J Pablo Mu noz, and Ali Jannesari. 2023. Federated foundation models: Privacy-preserving and collaborative learning for large models. arXiv preprint arXiv:2305.11414 (2023)."},{"key":"e_1_3_2_2_74_1","volume-title":"Human Preference: A Survey. arXiv preprint arXiv:2503.14504","author":"Yu Tao","year":"2025","unstructured":"Tao Yu, Yi-Fan Zhang, Chaoyou Fu, Junkang Wu, Jinda Lu, Kun Wang, Xingyu Lu, Yunhang Shen, Guibin Zhang, Dingjie Song, et al., 2025. Aligning Multimodal LLM with Human Preference: A Survey. arXiv preprint arXiv:2503.14504 (2025)."},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00069"},{"key":"e_1_3_2_2_77_1","volume-title":"Multi-P2A: A Multi-perspective Benchmark on Privacy Assessment for Large Vision-Language Models. arXiv preprint arXiv:2412.19496","author":"Zhang Jie","year":"2024","unstructured":"Jie Zhang, Xiangkui Cao, Zhouyu Han, Shiguang Shan, and Xilin Chen. 2024a. Multi-P2A: A Multi-perspective Benchmark on Privacy Assessment for Large Vision-Language Models. arXiv preprint arXiv:2412.19496 (2024)."},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3708501"},{"key":"e_1_3_2_2_79_1","first-page":"55006","article-title":"Lima: Less is more for alignment","volume":"36","author":"Zhou Chunting","year":"2023","unstructured":"Chunting Zhou, Pengfei Liu, Puxin Xu, Srinivasan Iyer, Jiao Sun, Yuning Mao, Xuezhe Ma, Avia Efrat, Ping Yu, Lili Yu, et al., 2023. Lima: Less is more for alignment. Advances in Neural Information Processing Systems, Vol. 36 (2023), 55006-55021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.940"},{"key":"e_1_3_2_2_81_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210080"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758156","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:16:04Z","timestamp":1765307764000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758156"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":82,"alternative-id":["10.1145\/3746027.3758156","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758156","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}