{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T17:57:09Z","timestamp":1776275829993,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","funder":[{"name":"National Nature Science Foundation of China","award":["62322211"],"award-info":[{"award-number":["62322211"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755212","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"382-391","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["From Language to Instance: Generative Visual Prompting for Zero-shot Camouflaged Object Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7490-3752","authenticated-orcid":false,"given":"Zihou","family":"Zhang","sequence":"first","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1758-5936","authenticated-orcid":false,"given":"Hao","family":"Li","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8190-1438","authenticated-orcid":false,"given":"Zhengwei","family":"Yang","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6834-6137","authenticated-orcid":false,"given":"Zechao","family":"Hu","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1943-8219","authenticated-orcid":false,"given":"Liang","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3846-9157","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2983"},{"key":"e_1_3_2_1_3_1","volume-title":"European Conference on Computer Vision. Springer, 332-348","author":"Chen Huafeng","year":"2024","unstructured":"Huafeng Chen, Dian Shao, Guangqian Guo, and Shan Gao. 2024b. Just a Hint: Point-Supervised Camouflaged Object Detection. In European Conference on Computer Vision. Springer, 332-348."},{"key":"e_1_3_2_1_4_1","volume-title":"European Conference on Computer Vision. Springer, 323-340","author":"Chen Yi-Chia","year":"2024","unstructured":"Yi-Chia Chen, Wei-Hua Li, Cheng Sun, Yu-Chiang Frank Wang, and Chu-Song Chen. 2024a. SAM4MLLM: Enhance Multi-Modal Large Language Model for Referring Expression Segmentation. In European Conference on Computer Vision. Springer, 323-340."},{"key":"e_1_3_2_1_5_1","first-page":"15863","article-title":"EmoDubber","author":"Cong Gaoxiang","year":"2025","unstructured":"Gaoxiang Cong, Jiadong Pan, Liang Li, Yuankai Qi, Yuxin Peng, Anton van den Hengel, Jian Yang, and Qingming Huang. 2025. EmoDubber: Towards High Quality and Emotion Controllable Movie Dubbing. In CVPR. 15863-15873.","journal-title":"Towards High Quality and Emotion Controllable Movie Dubbing. In CVPR."},{"key":"e_1_3_2_1_6_1","volume-title":"RoMa: Robust Dense Feature Matching. IEEE Conference on Computer Vision and Pattern Recognition","author":"Edstedt Johan","year":"2024","unstructured":"Johan Edstedt, Qiyu Sun, Georg B\u00f6kman, M\u00e5rten Wadenb\u00e4ck, and Michael Felsberg. 2024. RoMa: Robust Dense Feature Matching. IEEE Conference on Computer Vision and Pattern Recognition (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3085766"},{"key":"e_1_3_2_1_8_1","first-page":"30726","article-title":"Weakly-supervised concealed object segmentation with sam-based pseudo labeling and multi-scale feature grouping","volume":"36","author":"He Chunming","year":"2023","unstructured":"Chunming He, Kai Li, Yachao Zhang, Guoxia Xu, Longxiang Tang, Yulun Zhang, Zhenhua Guo, and Xiu Li. 2023b. Weakly-supervised concealed object segmentation with sam-based pseudo labeling and multi-scale feature grouping. Advances in Neural Information Processing Systems, Vol. 36, 30726-30737.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25156"},{"key":"e_1_3_2_1_10_1","volume-title":"INT: Instance-Specific Negative Mining for Task-Generic Promptable Segmentation. arXiv preprint arXiv:2501.18753","author":"Hu Jian","year":"2025","unstructured":"Jian Hu, Zixu Cheng, and Shaogang Gong. 2025. INT: Instance-Specific Negative Mining for Task-Generic Promptable Segmentation. arXiv preprint arXiv:2501.18753 (2025)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i11.29144"},{"key":"e_1_3_2_1_12_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Hu Jian","year":"2024","unstructured":"Jian Hu, Jiayi Lin, Junchi Yan, and Shaogang Gong. 2024b. Leveraging Hallucinations to Reduce Manual Prompt Dependency in Promptable Segmentation. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00538"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108414"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_16_1","volume-title":"Anabranch network for camouflaged object segmentation. Computer vision and image understanding","author":"Le Trung-Nghia","year":"2019","unstructured":"Trung-Nghia Le, Tam V Nguyen, Zhongliang Nie, Minh-Triet Tran, and Akihiro Sugimoto. 2019. Anabranch network for camouflaged object segmentation. Computer vision and image understanding, Vol. 184 (2019), 45-56."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32497"},{"key":"e_1_3_2_1_18_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023a. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01710"},{"key":"e_1_3_2_1_20_1","volume-title":"Clip surgery for better explainability with enhancement in open-vocabulary tasks. arXiv e-prints","author":"Li Yi","year":"2023","unstructured":"Yi Li, Hualiang Wang, Yiqun Duan, and Xiaomeng Li. 2023b. Clip surgery for better explainability with enhancement in open-vocabulary tasks. arXiv e-prints (2023), arXiv-2304."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01640"},{"key":"e_1_3_2_1_22_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36, 34892-34916."},{"key":"e_1_3_2_1_23_1","volume-title":"European Conference on Computer Vision. Springer, 38-55","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et al., 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European Conference on Computer Vision. Springer, 38-55."},{"key":"e_1_3_2_1_24_1","first-page":"3003","article-title":"Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding","volume":"45","author":"Liu Xuejing","year":"2022","unstructured":"Xuejing Liu, Liang Li, Shuhui Wang, Zheng-Jun Zha, Zechao Li, Qi Tian, and Qingming Huang. 2022. Entity-enhanced adaptive reconstruction network for weakly supervised referring expression grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 3 (2022), 3003-3018.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_25_1","unstructured":"Haoyu Lu Wen Liu Bo Zhang Bingxuan Wang Kai Dong Bo Liu Jingxiang Sun Tongzheng Ren Zhuoshu Li Hao Yang et al. 2024. Deepseek-vl: towards real-world vision-language understanding. arXiv preprint arXiv:2403.05525 (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01142"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680891"},{"key":"e_1_3_2_1_29_1","unstructured":"OpenAI. 2023. GPT-4V(ision) System Card. https:\/\/openai.com."},{"key":"e_1_3_2_1_30_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2024. DINOv2: Learning Robust Visual Features without Supervision. Transactions on Machine Learning Research Journal (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00220"},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_33_1","unstructured":"Tianhe Ren Shilong Liu Ailing Zeng Jing Lin Kunchang Li He Cao Jiayu Chen Xinyu Huang Yukang Chen Feng Yan et al. 2024. Grounded sam: Assembling open-world models for diverse visual tasks. arXiv preprint arXiv:2401.14159 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_35_1","unstructured":"Przemyslaw Skurowski Hassan Abdulameer Jakub Blaszczyk Tomasz Depta Adam Kornacki and Przemyslaw Kozie?. 2018. Animal camouflage analysis: Chameleon database. (2018) 7 pages."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01621"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680730"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02185-6"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00376"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3365104"},{"key":"e_1_3_2_1_41_1","volume-title":"European Conference on Computer Vision. Springer, 198-215","author":"Wan David","year":"2024","unstructured":"David Wan, Jaemin Cho, Elias Stengel-Eskin, and Mohit Bansal. 2024. Contrastive region guidance: Improving grounding in vision-language models without training. In European Conference on Computer Vision. Springer, 198-215."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00367"},{"key":"e_1_3_2_1_43_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024a. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.937"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00370"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00148"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1049\/cje.2021.00.455"},{"key":"e_1_3_2_1_48_1","volume-title":"Luc Van Gool, and Qibin Hou","author":"Yin Bowen","year":"2024","unstructured":"Bowen Yin, Xuying Zhang, Deng-Ping Fan, Shaohui Jiao, Ming-Ming Cheng, Luc Van Gool, and Qibin Hou. 2024. Camoformer: Masked separable attention for camouflaged object detection. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16434"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02214-4"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3432099"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01256"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.23919\/cje.2022.00.414"},{"key":"e_1_3_2_1_54_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681190"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755212","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:10:07Z","timestamp":1765339807000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755212"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":55,"alternative-id":["10.1145\/3746027.3755212","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755212","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}