{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:22:40Z","timestamp":1776889360686,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611911","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"4928-4938","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Distilling Vision-Language Foundation Models: A Data-Free Approach via Prompt Diversification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6365-5475","authenticated-orcid":false,"given":"Yunyi","family":"Xuan","sequence":"first","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5508-473X","authenticated-orcid":false,"given":"Weijie","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University &amp; Hikvision Research Institute, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9260-1334","authenticated-orcid":false,"given":"Shicai","family":"Yang","sequence":"additional","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8065-5901","authenticated-orcid":false,"given":"Di","family":"Xie","sequence":"additional","affiliation":[{"name":"Hikvision Research Institute, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1141-2487","authenticated-orcid":false,"given":"Luojun","family":"Lin","sequence":"additional","affiliation":[{"name":"Fuzhou University, Fuzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9017-2508","authenticated-orcid":false,"given":"Yueting","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence.","author":"Binici K.","unstructured":"K. Binici, S. Aggarwal, N. T. Pham, K. Leman, and T. Mitra. 2022. Robust and Resource-Efficient Data-Free Knowledge Distillation by Generative Pseudo Replay. In Proceedings of the AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-11382-1_18"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00361"},{"key":"e_1_3_2_1_4_1","volume-title":"Hinton","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey E. Hinton. 2020a. A Simple Framework for Contrastive Learning of Visual Representations. ArXiv, Vol. abs\/2002.05709 (2020)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-66096-3_30"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00741"},{"key":"e_1_3_2_1_7_1","volume-title":"Exploiting hierarchical context on a large database of object categories. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"Choi Myung Jin","unstructured":"Myung Jin Choi, Joseph J Lim, Antonio Torralba, and Alan S Willsky. 2010. Exploiting hierarchical context on a large database of object categories. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 129--136."},{"key":"e_1_3_2_1_8_1","volume-title":"Vqgan-clip: Open domain image generation and editing with natural language guidance. arXiv preprint arXiv:2204.08583","author":"Crowson Katherine","year":"2022","unstructured":"Katherine Crowson, Stella Biderman, Daniel Kornis, Dashiell Stander, Eric Hallahan, Louis Castricato, and Edward Raff. 2022. Vqgan-clip: Open domain image generation and editing with natural language guidance. arXiv preprint arXiv:2204.08583, Vol. 2 (2022)."},{"key":"e_1_3_2_1_9_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. ArXiv, Vol. abs\/1810.04805 (2019)."},{"key":"e_1_3_2_1_10_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv","author":"Dosovitskiy Alexey","year":"1929","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ArXiv, Vol. abs\/2010.11929 (2021)."},{"key":"e_1_3_2_1_11_1","unstructured":"M. Everingham L. Van Gool C. K. I. Williams J. Winn and A. Zisserman. [n. d.]. The PASCAL Visual Object Classes Challenge 2007 (VOC2007) Results. http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2007\/workshop\/index.html."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.208"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence.","author":"Fang G.","unstructured":"G. Fang, K. Mo, X. Wang, J. Song, S. Bei, H. Zhang, and M. Song. 2021a. Up to 100x Faster Data-free Knowledge Distillation. In Proceedings of the AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_14_1","unstructured":"G. Fang J. Song C. Shen X. Wang and M. Song. 2019. Data-Free Adversarial Distillation. (2019)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"G. Fang J. Song X. Wang C. Shen X. Wang and M. Song. 2021b. Contrastive Model Inversion for Data-Free Knowledge Distillation. In IJCAI.","DOI":"10.24963\/ijcai.2021\/327"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2006.79"},{"key":"e_1_3_2_1_17_1","volume-title":"Generative adversarial nets. Advances in neural information processing systems","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. Advances in neural information processing systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_19_1","first-page":"38","article-title":"Distilling the Knowledge in a Neural Network","volume":"14","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. Computer Science, Vol. 14, 7 (2015), 38--39.","journal-title":"Computer Science"},{"key":"e_1_3_2_1_20_1","volume-title":"Transductive Clip with Class-Conditional Contrastive Learning. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 3858--3862","author":"Huang Junchu","year":"2022","unstructured":"Junchu Huang, Weijie Chen, Shicai Yang, Di Xie, Shiliang Pu, and Yueting Zhuang. 2022a. Transductive Clip with Class-Conditional Contrastive Learning. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 3858--3862."},{"key":"e_1_3_2_1_21_1","volume-title":"Knowledge distillation from a stronger teacher. arXiv preprint arXiv:2205.10536","author":"Huang Tao","year":"2022","unstructured":"Tao Huang, Shan You, Fei Wang, Chen Qian, and Chang Xu. 2022b. Knowledge distillation from a stronger teacher. arXiv preprint arXiv:2205.10536 (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Cross-Domain Weakly-Supervised Object Detection Through Progressive Domain Adaptation. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Inoue Naoto","year":"2018","unstructured":"Naoto Inoue, Ryosuke Furuta, T. Yamasaki, and Kiyoharu Aizawa. 2018. Cross-Domain Weakly-Supervised Object Detection Through Progressive Domain Adaptation. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 5001--5009."},{"key":"e_1_3_2_1_23_1","unstructured":"Chao Jia Yinfei Yang Ye Xia Yi-Ting Chen Zarana Parekh Hieu Pham Quoc V. Le Yun-Hsuan Sung Zhen Li and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In ICML."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.591"},{"key":"e_1_3_2_1_25_1","volume-title":"Large-scale generative data-free distillation. arXiv preprint arXiv:2012.05578","author":"Luo Liangchen","year":"2020","unstructured":"Liangchen Luo, Mark Sandler, Zi Lin, Andrey Zhmoginov, and Andrew Howard. 2020. Large-scale generative data-free distillation. arXiv preprint arXiv:2012.05578 (2020)."},{"key":"e_1_3_2_1_26_1","volume-title":"Attention Diversification for Domain Generalization. In European Conference on Computer Vision (ECCV).","author":"Meng Rang","year":"2022","unstructured":"Rang Meng, Xianfeng Li, Weijie Chen, Shicai Yang, Jie Song, Xinchao Wang, Lei Zhang, Mingli Song, Di Xie, and Shiliang Pu. 2022. Attention Diversification for Domain Generalization. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_27_1","volume-title":"SLIP: Self-supervision meets Language-Image Pre-training. In ECCV.","author":"Mu Norman","year":"2022","unstructured":"Norman Mu, Alexander Kirillov, David A. Wagner, and Saining Xie. 2022. SLIP: Self-supervision meets Language-Image Pre-training. In ECCV."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00271"},{"key":"e_1_3_2_1_29_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_30_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-007-0090-8"},{"key":"e_1_3_2_1_32_1","unstructured":"Zhishu Sun Zhifeng Shen Luojun Lin Yuanlong Yu Zhifeng Yang Shicai Yang and Weijie Chen. 2022. Dynamic Domain Generalization. In IJCAI."},{"key":"e_1_3_2_1_33_1","volume-title":"FILIP: Fine-grained Interactive Language-Image Pre-Training. ArXiv","author":"Yao Lewei","year":"2022","unstructured":"Lewei Yao, Runhu Huang, Lu Hou, Guansong Lu, Minzhe Niu, Hang Xu, Xiaodan Liang, Zhenguo Li, Xin Jiang, and Chunjing Xu. 2022. FILIP: Fine-grained Interactive Language-Image Pre-Training. ArXiv, Vol. abs\/2111.07783 (2022)."},{"key":"e_1_3_2_1_34_1","volume-title":"Distill: Data-free Knowledge Transfer via DeepInversion. In CVPR.","author":"Yin H.","year":"2020","unstructured":"H. Yin, P. Molchanov, Z. Li, J. M. Alvarez, A. Mallya, D. Hoiem, N. K. Jha, and J. Kautz. 2020. Dreaming to Distill: Data-free Knowledge Transfer via DeepInversion. In CVPR."},{"key":"e_1_3_2_1_35_1","volume-title":"Florence: A New Foundation Model for Computer Vision. ArXiv","author":"Yuan Lu","year":"2021","unstructured":"Lu Yuan, Dongdong Chen, Yi-Ling Chen, Noel C. F. Codella, Xiyang Dai, Jianfeng Gao, Houdong Hu, Xuedong Huang, Boxin Li, Chunyuan Li, Ce Liu, Mengchen Liu, Zicheng Liu, Yumao Lu, Yu Shi, Lijuan Wang, Jianfeng Wang, Bin Xiao, Zhen Xiao, Jianwei Yang, Michael Zeng, Luowei Zhou, and Pengchuan Zhang. 2021. Florence: A New Foundation Model for Computer Vision. ArXiv, Vol. abs\/2111.11432 (2021)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01165"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611911","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611911","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:08:01Z","timestamp":1755821281000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611911"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":36,"alternative-id":["10.1145\/3581783.3611911","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611911","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}