{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:33Z","timestamp":1765357713014,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072048"],"award-info":[{"award-number":["62072048"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"BUPT Excellent Ph.D.Students Foundation","award":["CX20242004"],"award-info":[{"award-number":["CX20242004"]}]},{"name":"Industry University-Research Innovation Fund of Universities in China","award":["2021ITA07005"],"award-info":[{"award-number":["2021ITA07005"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681559","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"4273-4282","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["WaveDN: A Wavelet-based Training-free Zero-shot Enhancement for Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-1578-5485","authenticated-orcid":false,"given":"Jiulin","family":"Li","sequence":"first","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7832-0926","authenticated-orcid":false,"given":"Mengyu","family":"Yang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6683-5524","authenticated-orcid":false,"given":"Ye","family":"Tian","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0674-7864","authenticated-orcid":false,"given":"Lanshan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing Key Laboratory of Network System and Network Culture, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6950-5629","authenticated-orcid":false,"given":"Yongchun","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5524-2887","authenticated-orcid":false,"given":"Jice","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6418-8087","authenticated-orcid":false,"given":"Wendong","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Gpt-4 technical report","author":"Josh Achiam AI","year":"2024","unstructured":"OpenAI, Josh Achiam, and Steven Adler et al. Gpt-4 technical report, 2024."},{"key":"e_1_3_2_1_2_1","volume-title":"Ernie: Enhanced representation through knowledge integration","author":"Sun Yu","year":"2019","unstructured":"Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, and Hua Wu. Ernie: Enhanced representation through knowledge integration, 2019."},{"key":"e_1_3_2_1_3_1","volume-title":"Sora: A review on background, technology, limitations, and opportunities of large vision models","author":"Liu Yixin","year":"2024","unstructured":"Yixin Liu, Kai Zhang, Yuan Li, Zhiling Yan, Chujie Gao, Ruoxi Chen, Zhengqing Yuan, Yue Huang, Hanchi Sun, Jianfeng Gao, Lifang He, and Lichao Sun. Sora: A review on background, technology, limitations, and opportunities of large vision models, 2024."},{"key":"e_1_3_2_1_4_1","volume-title":"Learning transferable visual models from natural language supervision","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, JongWook Kim, Chris Hallacy, A. Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Askell Amanda, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. Learning transferable visual models from natural language supervision. Cornell University - arXiv,Cornell University - arXiv, Feb 2021."},{"key":"e_1_3_2_1_5_1","volume-title":"Dec","author":"Sohn Kihyuk","year":"2016","unstructured":"Kihyuk Sohn. Improved deep metric learning with multi-class n-pair loss objective. Neural Information Processing Systems,Neural Information Processing Systems, Dec 2016."},{"key":"e_1_3_2_1_6_1","volume-title":"Representation learning with contrastive predictive coding","author":"Oord Aaronvanden","year":"2018","unstructured":"Aaronvanden Oord, Yazhe Li, and Oriol Vinyals. Representation learning with contrastive predictive coding. Cornell University - arXiv,Cornell University - arXiv, Jul 2018."},{"key":"e_1_3_2_1_7_1","volume-title":"International Journal of Computer Vision, page 2337--2348","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. Learning to prompt for vision-language models. International Journal of Computer Vision, page 2337--2348, Sep 2022."},{"key":"e_1_3_2_1_8_1","first-page":"16816","volume-title":"Chen Change Loy, and Ziwei Liu. Conditional prompt learning for vision-language models","author":"Zhou Kaiyang","year":"2022","unstructured":"Kaiyang Zhou, Jingkang Yang, Chen Change Loy, and Ziwei Liu. Conditional prompt learning for vision-language models. pages 16816--16825, 2022."},{"key":"e_1_3_2_1_9_1","volume-title":"Tip-adapter: Training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930","author":"Zhang Renrui","year":"2021","unstructured":"Renrui Zhang, Rongyao Fang, Wei Zhang, Peng Gao, Kunchang Li, Jifeng Dai, Yu Qiao, and Hongsheng Li. Tip-adapter: Training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930, 2021."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611706"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00246"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25152"},{"key":"e_1_3_2_1_14_1","first-page":"36","author":"Zhou Yifei","year":"2024","unstructured":"Yifei Zhou, Juntao Ren, Fengyu Li, Ramin Zabih, and Ser Nam Lim. Test-time distribution normalization for contrastively learned visual-language models. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01047"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548021"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00257"},{"key":"e_1_3_2_1_18_1","volume-title":"Visual classification via description from large language models. arXiv preprint arXiv:2210.07183","author":"Menon Sachit","year":"2022","unstructured":"Sachit Menon and Carl Vondrick. Visual classification via description from large language models. arXiv preprint arXiv:2210.07183, 2022."},{"key":"e_1_3_2_1_19_1","volume-title":"Follow-up differential descriptions: Language models resolve ambiguities for image classification. arXiv preprint arXiv:2311.07593","author":"Esfandiarpoor Reza","year":"2023","unstructured":"Reza Esfandiarpoor and Stephen H Bach. Follow-up differential descriptions: Language models resolve ambiguities for image classification. arXiv preprint arXiv:2311.07593, 2023."},{"key":"e_1_3_2_1_20_1","volume-title":"Representation learning with contrastive predictive coding","author":"Oord Aaronvanden","year":"2018","unstructured":"Aaronvanden Oord, Yazhe Li, and Oriol Vinyals. Representation learning with contrastive predictive coding. Cornell University - arXiv,Cornell University - arXiv, Jul 2018."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01981"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00158"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01585"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00187"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00215"},{"key":"e_1_3_2_1_26_1","volume-title":"Darenerf: Direction-aware representation for dynamic scenes. arXiv preprint arXiv:2403.02265","author":"Lou Ange","year":"2024","unstructured":"Ange Lou, Benjamin Planche, Zhongpai Gao, Yamin Li, Tianyu Luan, Hao Ding, Terrence Chen, Jack Noble, and Ziyan Wu. Darenerf: Direction-aware representation for dynamic scenes. arXiv preprint arXiv:2403.02265, 2024."},{"key":"e_1_3_2_1_27_1","volume-title":"Jun","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, AidanN. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need. Neural Information Processing Systems,Neural Information Processing Systems, Jun 2017."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_29_1","volume-title":"Oct","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv: Computer Vision and Pattern Recognition,arXiv: Computer Vision and Pattern Recognition, Oct 2020."},{"key":"e_1_3_2_1_30_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, 34:9694--9705","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, 34:9694--9705, 2021."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"e_1_3_2_1_32_1","first-page":"1015","volume-title":"Piczak. ESC: Dataset for Environmental Sound Classification. In Proceedings of the 23rd Annual ACM Conference on Multimedia","author":"Karol","unstructured":"Karol J. Piczak. ESC: Dataset for Environmental Sound Classification. In Proceedings of the 23rd Annual ACM Conference on Multimedia, pages 1015--1018. ACM Press."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_36_1","volume-title":"Learning multiple layers of features from tiny images","author":"Krizhevsky Alex","year":"2009","unstructured":"Alex Krizhevsky, Geoffrey Hinton, et al. Learning multiple layers of features from tiny images. 2009."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_40_1","volume-title":"Food-101 -- Mining Discriminative Components with Random Forests, page 446--461","author":"Bossard Lukas","year":"2014","unstructured":"Lukas Bossard, Matthieu Guillaumin, and Luc Van Gool. Food-101 -- Mining Discriminative Components with Random Forests, page 446--461. Jan 2014."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2355061"},{"key":"e_1_3_2_1_43_1","first-page":"2217","volume-title":"Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification","author":"Helber Patrick","year":"2019","unstructured":"Patrick Helber, Benjamin Bischke, Andreas Dengel, and Damian Borth. Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing, page 2217--2226, Jul 2019."},{"key":"e_1_3_2_1_44_1","volume-title":"Fine-grained visual classification of aircraft","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, MatthewB. Blaschko, and Andrea Vedaldi. Fine-grained visual classification of aircraft. Le Centre pour la Communication Scientifique Directe - HAL - memSIC,Le Centre pour la Communication Scientifique Directe - HAL - memSIC, Jun 2013."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0965-7"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"e_1_3_2_1_47_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, 34:9694--9705","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, 34:9694--9705, 2021."},{"key":"e_1_3_2_1_48_1","volume-title":"Microsoft COCO: Common Objects in Context, page 740--755","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. Microsoft COCO: Common Objects in Context, page 740--755. Jan 2014."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681559","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681559","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:48Z","timestamp":1750295868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681559"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":49,"alternative-id":["10.1145\/3664647.3681559","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681559","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}