{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:32Z","timestamp":1765357712121,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681624","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"1004-1013","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["SSAT-Adapter: Enhancing Vision-Language Model Few-shot Learning with Auxiliary Tasks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-5062-2890","authenticated-orcid":false,"given":"Bowen","family":"Chen","sequence":"first","affiliation":[{"name":"The University of Auckland, Auckland, New Zealand"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7256-4049","authenticated-orcid":false,"given":"Yun Sing","family":"Koh","sequence":"additional","affiliation":[{"name":"The University of Auckland, Auckland, New Zealand"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7245-0367","authenticated-orcid":false,"given":"Gillian","family":"Dobbie","sequence":"additional","affiliation":[{"name":"The University of Auckland, Auckland, New Zealand"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Keshigeyan Chandrasegaran, Guimeng Liu, and Ngai-Man Cheung.","author":"Abdollahzadeh Milad","year":"2023","unstructured":"Milad Abdollahzadeh, Touba Malekzadeh, Christopher TH Teo, Keshigeyan Chandrasegaran, Guimeng Liu, and Ngai-Man Cheung. 2023. A survey on generative modeling with limited data, few shots, and zero shot. arXiv preprint arXiv:2307.14397 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Albalak Alon","year":"2024","unstructured":"Alon Albalak, Colin A Raffel, and William Yang Wang. 2024. Improving few-shot generalization by exploring and exploiting auxiliary data. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.compag.2020.105542"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings, Part VI 13","author":"Bossard Lukas","year":"2014","unstructured":"Lukas Bossard, Matthieu Guillaumin, and Luc Van Gool. 2014. Food-101--mining discriminative components with random forests. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part VI 13. Springer, 446--461."},{"key":"e_1_3_2_1_5_1","volume-title":"Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096","author":"Brock Andrew","year":"2018","unstructured":"Andrew Brock, Jeff Donahue, and Karen Simonyan. 2018. Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096 (2018)."},{"key":"e_1_3_2_1_6_1","first-page":"20941","article-title":"Data-efficient gan training beyond (just) augmentations: A lottery ticket perspective","volume":"34","author":"Chen Tianlong","year":"2021","unstructured":"Tianlong Chen, Yu Cheng, Zhe Gan, Jingjing Liu, and Zhangyang Wang. 2021. Data-efficient gan training beyond (just) augmentations: A lottery ticket perspective. Advances in Neural Information Processing Systems, Vol. 34 (2021), 20941--20955.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_7_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_8_1","volume-title":"Yu-Chiang Frank Wang, and Jia-Bin Huang","author":"Chen Wei-Yu","year":"2019","unstructured":"Wei-Yu Chen, Yen-Cheng Liu, Zsolt Kira, Yu-Chiang Frank Wang, and Jia-Bin Huang. 2019. A closer look at few-shot classification. arXiv preprint arXiv:1904.04232 (2019)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01345"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02011"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_14_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_15_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_16_1","first-page":"31782","article-title":"Diggan: Discriminator gradient gap regularization for gan training with limited data","volume":"35","author":"Fang Tiantian","year":"2022","unstructured":"Tiantian Fang, Ruoyu Sun, and Alex Schwing. 2022. Diggan: Discriminator gradient gap regularization for gan training with limited data. Advances in Neural Information Processing Systems, Vol. 35 (2022), 31782--31795.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"e_1_3_2_1_19_1","unstructured":"Yixiao Ge Feng Zhu Dapeng Chen Rui Zhao et al. 2020. Self-paced contrastive learning with hybrid memory for domain adaptive object re-id. Advances in neural information processing systems Vol. 33 (2020) 11309--11321."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"e_1_3_2_1_21_1","first-page":"10294","article-title":"Self-supervised auxiliary learning with meta-paths for heterogeneous graphs","volume":"33","author":"Hwang Dasol","year":"2020","unstructured":"Dasol Hwang, Jinyoung Park, Sunyoung Kwon, KyungMin Kim, Jung-Woo Ha, and Hyunwoo J Kim. 2020. Self-supervised auxiliary learning with meta-paths for heterogeneous graphs. Advances in Neural Information Processing Systems, Vol. 33 (2020), 10294--10305.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_22_1","first-page":"21655","article-title":"Deceive d: Adaptive pseudo augmentation for gan training with limited data","volume":"34","author":"Jiang Liming","year":"2021","unstructured":"Liming Jiang, Bo Dai, Wayne Wu, and Chen Change Loy. 2021. Deceive d: Adaptive pseudo augmentation for gan training with limited data. Advances in Neural Information Processing Systems, Vol. 34 (2021), 21655--21667.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1186\/s12880-023-01007-4"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00241"},{"key":"e_1_3_2_1_27_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_29_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-022-00516-1"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00327"},{"key":"e_1_3_2_1_33_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Liu Shikun","year":"2019","unstructured":"Shikun Liu, Andrew Davison, and Edward Johns. 2019. Self-supervised generalisation with meta auxiliary learning. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20074"},{"key":"e_1_3_2_1_35_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151","author":"Maji Subhransu","year":"2013","unstructured":"Subhransu Maji, Esa Rahtu, Juho Kannala, Matthew Blaschko, and Andrea Vedaldi. 2013. Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151 (2013)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00270"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054073"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_40_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Ning Chuanruo","year":"2024","unstructured":"Chuanruo Ning, Ruihai Wu, Haoran Lu, Kaichun Mo, and Hao Dong. 2024. Where2explore: Few-shot affordance learning for unseen novel categories of articulated objects. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_2_1_42_1","first-page":"16686","article-title":"Self-paced contrastive learning for semi-supervised medical image segmentation with meta-labels","volume":"34","author":"Peng Jizong","year":"2021","unstructured":"Jizong Peng, Ping Wang, Christian Desrosiers, and Marco Pedersoli. 2021. Self-paced contrastive learning for semi-supervised medical image segmentation with meta-labels. Advances in Neural Information Processing Systems, Vol. 34 (2021), 16686--16699.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445518"},{"key":"e_1_3_2_1_45_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Song Lin","year":"2024","unstructured":"Lin Song, Ruoyi Xue, Hang Wang, Hongbin Sun, Yixiao Ge, Ying Shan, et al. 2024. Meta-Adapter: An Online Few-shot Learner for Vision-Language Model. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.97"},{"key":"e_1_3_2_1_48_1","first-page":"30569","article-title":"Dualcoop: Fast adaptation to multi-label recognition with limited annotations","volume":"35","author":"Sun Ximeng","year":"2022","unstructured":"Ximeng Sun, Ping Hu, and Kate Saenko. 2022. Dualcoop: Fast adaptation to multi-label recognition with limited annotations. Advances in Neural Information Processing Systems, Vol. 35 (2022), 30569--30582.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_49_1","volume-title":"Generalizing from a few examples: A survey on few-shot learning. ACM computing surveys (csur)","author":"Wang Yaqing","year":"2020","unstructured":"Yaqing Wang, Quanming Yao, James T Kwok, and Lionel M Ni. 2020. Generalizing from a few examples: A survey on few-shot learning. ACM computing surveys (csur), Vol. 53, 3 (2020), 1--34."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3234993"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00780"},{"volume-title":"Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"Xiao Jianxiong","key":"e_1_3_2_1_52_1","unstructured":"Jianxiong Xiao, James Hays, Krista A Ehinger, Aude Oliva, and Antonio Torralba. 2010. Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 3485--3492."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2023.3279023"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1186\/s13007-022-00866-2"},{"key":"e_1_3_2_1_55_1","volume-title":"Looking beyond single images for contrastive semantic segmentation learning. Advances in neural information processing systems","author":"Zhang Feihu","year":"2021","unstructured":"Feihu Zhang, Philip Torr, Ren\u00e9 Ranftl, and Stephan Richter. 2021. Looking beyond single images for contrastive semantic segmentation learning. Advances in neural information processing systems, Vol. 34 (2021), 3285--3297."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3263112"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01460"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00339"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2021.3070203"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i15.29639"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681624","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681624","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681624"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":63,"alternative-id":["10.1145\/3664647.3681624","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681624","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}