{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:37Z","timestamp":1765357717174,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Project of China","award":["2020AAA0105600"],"award-info":[{"award-number":["2020AAA0105600"]}]},{"name":"Shenzhen Key Technical Projects under Grant","award":["CJGJZD2022051714160501"],"award-info":[{"award-number":["CJGJZD2022051714160501"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. U21B2048 and No. 62302382"],"award-info":[{"award-number":["No. U21B2048 and No. 62302382"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680983","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"797-806","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Enhancing Pre-trained ViTs for Downstream Task Adaptation: A Locality-Aware Prompt Learning Method"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8945-1200","authenticated-orcid":false,"given":"Shaokun","family":"Wang","sequence":"first","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5291-9003","authenticated-orcid":false,"given":"Yifan","family":"Yu","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6750-1403","authenticated-orcid":false,"given":"Yuhang","family":"He","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1793-5836","authenticated-orcid":false,"given":"Yihong","family":"Gong","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong University, Xi'an, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Shir Amir Yossi Gandelsman Shai Bagon and Tali Dekel. 2022. Deep ViT Features as Dense Visual Descriptors. In ECCVW What is Motion For?"},{"key":"e_1_3_2_1_2_1","volume-title":"Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274","author":"Bahng Hyojin","year":"2022","unstructured":"Hyojin Bahng, Ali Jahanian, Swami Sankaranarayanan, and Phillip Isola. 2022. Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274 (2022)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the international conference on Machine learning. 1725--1735","author":"Chen Ming","year":"2020","unstructured":"Ming Chen, Zhewei Wei, Zengfeng Huang, Bolin Ding, and Yaliang Li. 2020. Simple and deep graph convolutional networks. In Proceedings of the international conference on Machine learning. 1725--1735."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612185"},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_8_1","volume-title":"International Conference on Machine Learning. 2286--2296","author":"Ascoli St\u00e9phane","year":"2021","unstructured":"St\u00e9phane d'Ascoli, Hugo Touvron, Matthew L Leavitt, Ari S Morcos, Giulio Biroli, and Levent Sagun. 2021. Convit: Improving vision transformers with soft convolutional inductive biases. In International Conference on Machine Learning. 2286--2296."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01175"},{"key":"e_1_3_2_1_10_1","volume-title":"Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544","author":"Gao Peng","year":"2021","unstructured":"Peng Gao, Shijie Geng, Renrui Zhang, Teli Ma, Rongyao Fang, Yongfeng Zhang, Hongsheng Li, and Yu Qiao. 2021. Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544 (2021)."},{"key":"e_1_3_2_1_11_1","first-page":"42","article-title":"A Survey of Methods for Explaining Black Box Models","volume":"51","author":"Guidotti Riccardo","year":"2018","unstructured":"Riccardo Guidotti, Anna Monreale, Salvatore Ruggieri, Franco Turini, Fosca Giannotti, and Dino Pedreschi. 2018. A Survey of Methods for Explaining Black Box Models. ACM Comput. Surv., Vol. 51, 5 (2018), 42.","journal-title":"ACM Comput. Surv."},{"key":"e_1_3_2_1_12_1","first-page":"8291","article-title":"Vision gnn: An image is worth graph of nodes","volume":"35","author":"Han Kai","year":"2022","unstructured":"Kai Han, Yunhe Wang, Jianyuan Guo, Yehui Tang, and Enhua Wu. 2022. Vision gnn: An image is worth graph of nodes. Advances in Neural Information Processing Systems, Vol. 35 (2022), 8291--8303.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00599"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.automatica.2003.10.001"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00936"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"e_1_3_2_1_21_1","volume-title":"Yan Zheng, Wei Zhang, and Kwan-Liu Ma.","author":"Li Yiran","year":"2023","unstructured":"Yiran Li, Junpeng Wang, Xin Dai, Liang Wang, Chin-Chia Michael Yeh, Yan Zheng, Wei Zhang, and Kwan-Liu Ma. 2023. How Does Attention Work in Vision Transformers? A Visual Analytics Attempt. IEEE Transactions on Visualization and Computer Graphics (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Localvit: Bringing locality to vision transformers. arXiv preprint arXiv:2104.05707","author":"Li Yawei","year":"2021","unstructured":"Yawei Li, Kai Zhang, Jiezhang Cao, Radu Timofte, and Luc Van Gool. 2021. Localvit: Bringing locality to vision transformers. arXiv preprint arXiv:2104.05707 (2021)."},{"key":"e_1_3_2_1_23_1","first-page":"1","article-title":"Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing","volume":"55","author":"Liu Pengfei","year":"2023","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Zhengbao Jiang, Hiroaki Hayashi, and Graham Neubig. 2023. Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. Comput. Surveys, Vol. 55, 9 (2023), 1--35.","journal-title":"Comput. Surveys"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"e_1_3_2_1_25_1","first-page":"23818","article-title":"Efficient training of visual transformers with small datasets","volume":"34","author":"Liu Yahui","year":"2021","unstructured":"Yahui Liu, Enver Sangineto, Wei Bi, Nicu Sebe, Bruno Lepri, and Marco Nadai. 2021. Efficient training of visual transformers with small datasets. Advances in Neural Information Processing Systems, Vol. 34 (2021), 23818--23830.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_26_1","volume-title":"Transformer in Convolutional Neural Networks. CoRR","author":"Liu Yun","year":"2021","unstructured":"Yun Liu, Guolei Sun, Yu Qiu, Le Zhang, Ajad Chhatkuli, and Luc Van Gool. 2021. Transformer in Convolutional Neural Networks. CoRR, Vol. abs\/2106.03180 (2021)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_28_1","volume-title":"Black-box vs. white-box: Understanding their advantages and weaknesses from a practical point of view","author":"Loyola-Gonzalez Octavio","year":"2019","unstructured":"Octavio Loyola-Gonzalez. 2019. Black-box vs. white-box: Understanding their advantages and weaknesses from a practical point of view. IEEE access, Vol. 7 (2019), 154096--154113."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"e_1_3_2_1_30_1","volume-title":"Spair-71k: A large-scale benchmark for semantic correspondence. arXiv preprint arXiv:1908.10543","author":"Min Juhong","year":"2019","unstructured":"Juhong Min, Jongmin Lee, Jean Ponce, and Minsu Cho. 2019. Spair-71k: A large-scale benchmark for semantic correspondence. arXiv preprint arXiv:1908.10543 (2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00207"},{"key":"e_1_3_2_1_32_1","volume-title":"The 2017 davis challenge on video object segmentation. CoRR","author":"Pont-Tuset Jordi","year":"2017","unstructured":"Jordi Pont-Tuset, Federico Perazzi, Sergi Caelles, Pablo Arbel\u00e1ez, Alex Sorkine-Hornung, and Luc Van Gool. 2017. The 2017 davis challenge on video object segmentation. CoRR, Vol. abs\/1704.00675 (2017). showeprint[arXiv]1704.00675"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00598"},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. 8748--8763."},{"key":"e_1_3_2_1_35_1","first-page":"12116","article-title":"Do vision transformers see like convolutional neural networks","volume":"34","author":"Raghu Maithra","year":"2021","unstructured":"Maithra Raghu, Thomas Unterthiner, Simon Kornblith, Chiyuan Zhang, and Alexey Dosovitskiy. 2021. Do vision transformers see like convolutional neural networks? Advances in Neural Information Processing Systems, Vol. 34 (2021), 12116--12128.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"Learning multiple visual domains with residual adapters. Advances in neural information processing systems","author":"Rebuffi Sylvestre-Alvise","year":"2017","unstructured":"Sylvestre-Alvise Rebuffi, Hakan Bilen, and Andrea Vedaldi. 2017. Learning multiple visual domains with residual adapters. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_37_1","volume-title":"An Explainable Self-Labeling Grey-Box Model. In International Conference on Pattern Analysis and Intelligent Systems. 1--7.","author":"Seddik Boudissa","year":"2022","unstructured":"Boudissa Seddik, Drif Ahlem, and Cherifi Hocine. 2022. An Explainable Self-Labeling Grey-Box Model. In International Conference on Pattern Analysis and Intelligent Systems. 1--7."},{"key":"e_1_3_2_1_38_1","volume-title":"How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers. CoRR","author":"Steiner Andreas","year":"2021","unstructured":"Andreas Steiner, Alexander Kolesnikov, Xiaohua Zhai, Ross Wightman, Jakob Uszkoreit, and Lucas Beyer. 2021. How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers. CoRR, Vol. abs\/2106.10270 (2021). showeprint[arXiv]2106.10270"},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Machine Learning. 10347--10357","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herve Jegou. 2021. Training data-efficient image transformers & distillation through attention. In International Conference on Machine Learning. 10347--10357."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13347-021-00477-0"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00723"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00024"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611817"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"e_1_3_2_1_45_1","volume-title":"Class-aware visual prompt tuning for vision-language pre-trained model. arXiv preprint arXiv:2208.08340","author":"Xing Yinghui","year":"2022","unstructured":"Yinghui Xing, Qirui Wu, De Cheng, Shizhou Zhang, Guoqiang Liang, and Yanning Zhang. 2022. Class-aware visual prompt tuning for vision-language pre-trained model. arXiv preprint arXiv:2208.08340 (2022)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00983"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"e_1_3_2_1_50_1","volume-title":"Neural prompt search. arXiv preprint arXiv:2206.04673","author":"Zhang Yuanhan","year":"2022","unstructured":"Yuanhan Zhang, Kaiyang Zhou, and Ziwei Liu. 2022. Neural prompt search. arXiv preprint arXiv:2206.04673 (2022)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.398"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01435"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680983","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680983","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:35Z","timestamp":1750295855000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680983"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":54,"alternative-id":["10.1145\/3664647.3680983","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680983","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}