{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T18:51:50Z","timestamp":1781635910043,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700483","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-1","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Watermarking Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4267-473X","authenticated-orcid":false,"given":"Shan","family":"Wan","sequence":"first","affiliation":[{"name":"Shanghai Normal University, Shanghai, CN"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1633-7575","authenticated-orcid":false,"given":"Wu","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Science and Technology, Hefei, Anhui, CN"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7417-8399","authenticated-orcid":false,"given":"Yijun","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering_Chinese Academy of Sciences, Beijing, CN"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3286-1481","authenticated-orcid":false,"given":"Feiniu","family":"Yuan","sequence":"additional","affiliation":[{"name":"Shanghai Normal University, Shanghai, CN"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1235-6941","authenticated-orcid":false,"given":"Chunli","family":"Meng","sequence":"additional","affiliation":[{"name":"Shanghai Normal University, Shanghai, CN"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_2_2_2","first-page":"1615","volume-title":"27th USENIX Security Symposium (USENIX Security 18)","author":"Adi Yossi","year":"2018","unstructured":"Yossi Adi, Carsten Baum, Moustapha Cisse, Benny Pinkas, and Joseph Keshet. 2018. Turning your weakness into a strength: Watermarking deep neural networks by backdooring. In 27th USENIX Security Symposium (USENIX Security 18). 1615\u20131631. https:\/\/dl.acm.org\/doi\/10.5555\/3277203.3277324"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01605"},{"key":"e_1_3_3_2_4_2","unstructured":"Wenliang Dai Junnan Li Dongxu Li Anthony Meng\u00a0Huat Tiong Junqi Zhao Weisheng Wang Boyang Li Pascale Fung and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arxiv:https:\/\/arXiv.org\/abs\/2305.06500\u00a0[cs.CV]"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00670"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00503"},{"key":"e_1_3_3_2_7_2","unstructured":"Lixin Fan Kam\u00a0Woh Ng and Chee\u00a0Seng Chan. 2019. [Extended version] Rethinking Deep Neural Network Ownership Verification: Embedding Passports to Defeat Ambiguity Attacks. arxiv:https:\/\/arXiv.org\/abs\/1909.07830\u00a0[cs.CR]"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00291"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.5555\/1996289"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Guang Hua Andrew Beng\u00a0Jin Teoh Yong Xiang and Hao Jiang. 2023. Unambiguous and high-fidelity backdoor watermarking for deep neural networks. IEEE Transactions on Neural Networks and Learning Systems (2023).","DOI":"10.1109\/TNNLS.2023.3250210"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Fred Jelinek Robert\u00a0L Mercer Lalit\u00a0R Bahl and James\u00a0K Baker. 1977. Perplexity\u2014a measure of the difficulty of speech recognition tasks. The Journal of the Acoustical Society of America 62 S1 (1977) S63\u2013S63.","DOI":"10.1121\/1.2016299"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_3_2_13_2","first-page":"5583","volume-title":"International conference on machine learning","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583\u20135594."},{"key":"e_1_3_3_2_14_2","first-page":"17061","volume-title":"International Conference on Machine Learning","author":"Kirchenbauer John","year":"2023","unstructured":"John Kirchenbauer, Jonas Geiping, Yuxin Wen, Jonathan Katz, Ian Miers, and Tom Goldstein. 2023. A watermark for large language models. In International Conference on Machine Learning. PMLR, 17061\u201317084."},{"key":"e_1_3_3_2_15_2","unstructured":"John Kirchenbauer Jonas Geiping Yuxin Wen Manli Shu Khalid Saifullah Kezhi Kong Kasun Fernando Aniruddha Saha Micah Goldblum and Tom Goldstein. 2023. On the reliability of watermarks for large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.04634 (2023)."},{"key":"e_1_3_3_2_16_2","unstructured":"Rohith Kuditipudi John Thickstun Tatsunori Hashimoto and Percy Liang. 2023. Robust distortion-free watermarks for language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.15593 (2023)."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","unstructured":"Erwan Le\u00a0Merrer Patrick P\u00e9rez and Gilles Tr\u00e9dan. 2019. Adversarial frontier stitching for remote neural network watermarking. Neural Computing and Applications 32 13 (Aug. 2019) 9233\u20139244. 10.1007\/s00521-019-04434-z","DOI":"10.1007\/s00521-019-04434-z"},{"key":"e_1_3_3_2_18_2","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2301.12597\u00a0[cs.CV]"},{"key":"e_1_3_3_2_19_2","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_2_20_2","unstructured":"Junnan Li Ramprasaath Selvaraju Akhilesh Gotmare Shafiq Joty Caiming Xiong and Steven Chu\u00a0Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34 (2021) 9694\u20139705."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Mingjie Li Zichi Wang and Xinpeng Zhang. 2023. An effective framework for intellectual property protection of NLG models. Symmetry 15 6 (2023) 1287.","DOI":"10.3390\/sym15061287"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Jian\u00a0Han Lim Chee\u00a0Seng Chan Kam\u00a0Woh Ng Lixin Fan and Qiang Yang. 2022. Protect show attend and tell: Empowering image captioning models with ownership protection. Pattern Recognition 122 (2022) 108285.","DOI":"10.1016\/j.patcog.2021.108285"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Chong Liu Yuqi Zhang Hongsong Wang Weihua Chen Fan Wang Yan Huang Yi-Dong Shen and Liang Wang. 2023. Efficient token-guided image-text retrieval with consistent multimodal contrastive training. IEEE Transactions on Image Processing (2023).","DOI":"10.1109\/TIP.2023.3286710"},{"key":"e_1_3_3_2_26_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning."},{"key":"e_1_3_3_2_27_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318. https:\/\/dl.acm.org\/doi\/10.3115\/1073083.1073135"},{"key":"e_1_3_3_2_28_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Bita\u00a0Darvish Rouhani Huili Chen and Farinaz Koushanfar. 2018. DeepSigns: A Generic Watermarking Framework for IP Protection of Deep Learning Models. arxiv:https:\/\/arXiv.org\/abs\/1804.00750\u00a0[cs.CR]","DOI":"10.1145\/3297858.3304051"},{"key":"e_1_3_3_2_30_2","unstructured":"Yuanmin Tang Jing Yu Keke Gai Xiangyan Qu Yue Hu Gang Xiong and Qi Wu. 2023. Watermarking Vision-Language Pre-trained Models for Multi-modal Embedding as a Service. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.05863 (2023)."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3078971.3078974"},{"key":"e_1_3_3_2_32_2","unstructured":"Jiapeng Wang Chengyu Wang Xiaodan Wang Jun Huang and Lianwen Jin. 2023. ConaCLIP: Exploring Distillation of Fully-Connected Knowledge Interaction Graph for Lightweight Text-Image Retrieval. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.17652 (2023)."},{"key":"e_1_3_3_2_33_2","unstructured":"Lean Wang Wenkai Yang Deli Chen Hao Zhou Yankai Lin Fandong Meng Jie Zhou and Xu Sun. 2023. Towards codable text watermarking for large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.15992 (2023)."},{"key":"e_1_3_3_2_34_2","unstructured":"Xi Yang Kejiang Chen Weiming Zhang Chang Liu Yuang Qi Jie Zhang Han Fang and Nenghai Yu. 2023. Watermarking text generated by black-box language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.08883 (2023)."},{"key":"e_1_3_3_2_35_2","unstructured":"KiYoon Yoo Wonhyuk Ahn Jiho Jang and Nojun Kwak. 2023. Robust multi-bit natural language watermarking through invariant features. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.01904 (2023)."},{"key":"e_1_3_3_2_36_2","unstructured":"Jie Zhang Dongdong Chen Jing Liao Weiming Zhang Gang Hua and Nenghai Yu. 2020. Passport-aware normalization for deep model protection. Advances in Neural Information Processing Systems 33 (2020) 22619\u201322628."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3196494.3196550"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","location":"Auckland New Zealand","acronym":"MMAsia '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700483","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700483","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:16Z","timestamp":1750295416000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700483"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":36,"alternative-id":["10.1145\/3696409.3700483","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700483","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}