{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T16:51:56Z","timestamp":1781542316152,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810841","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"986-995","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Beyond Post-hoc Fusion: Rethinking Cross-Modal Interaction Timing in Few-Shot Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-0927-2991","authenticated-orcid":false,"given":"Liang","family":"Yang","sequence":"first","affiliation":[{"name":"College of Computer and Information Sciences, Fujian Agriculture and Forestry University, Fuzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7031-8614","authenticated-orcid":false,"given":"Hongyuan","family":"Xiao","sequence":"additional","affiliation":[{"name":"College of Computer and Information Sciences, Fujian Agriculture and Forestry University, Fuzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7244-4817","authenticated-orcid":false,"given":"Songtao","family":"He","sequence":"additional","affiliation":[{"name":"College of Computer and Information Sciences, Fujian Agriculture and Forestry University, Fuzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1426-9644","authenticated-orcid":false,"given":"Ye","family":"Lin","sequence":"additional","affiliation":[{"name":"College of Computer and Information Sciences, Fujian Agriculture and Forestry University, Fuzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8782-9414","authenticated-orcid":false,"given":"Zhenchang","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computer and Information Sciences, Fujian Agriculture and Forestry University, Fuzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Tadas Baltru\u0161aitis Chaitanya Ahuja and Louis-Philippe Morency. 2019. Multimodal Machine Learning: A Survey and Taxonomy. IEEE Transactions on Pattern Analysis and Machine Intelligence 41 2 (2019) 423\u2013443.","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02336"},{"key":"e_1_3_3_1_4_2","first-page":"446","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV)","author":"Bossard Lukas","year":"2014","unstructured":"Lukas Bossard, Matthieu Guillaumin, and Luc Van\u00a0Gool. 2014. Food-101 \u2013 Mining Discriminative Components with Random Forests. In Proceedings of the European Conference on Computer Vision (ECCV). Springer International Publishing, Cham, 446\u2013461."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_1_9_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Peng Gao Shijie Geng Renrui Zhang Haohang Ma Rongyao Fang Zhang Zhang Hongsheng Li and Yu Qiao. 2024. CLIP-Adapter: Better Vision-Language Models with Feature Adapters. International Journal of Computer Vision 132 2 (2024) 581\u2013595.","DOI":"10.1007\/s11263-023-01891-x"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25152"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"crossref","unstructured":"Patrick Helber Benjamin Bischke Andreas Dengel and Damian Borth. 2019. EuroSAT: A Novel Dataset and Deep Learning Benchmark for Land Use and Land Cover Classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing 12 7 (2019) 2217\u20132226.","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02244"},{"key":"e_1_3_3_1_16_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Jing Li","year":"2022","unstructured":"Li Jing, Pascal Vincent, Yann LeCun, and Yuandong Tian. 2022. Understanding Dimensional Collapse in Contrastive Self-Supervised Learning. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_3_1_19_2","first-page":"178","volume-title":"Proceedings of the 2004 IEEE Computer Vision and Pattern Recognition Workshop (CVPRW)","author":"Li Fei-Fei","year":"2004","unstructured":"Fei-Fei Li, Robert Fergus, and Pietro Perona. 2004. Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories. In Proceedings of the 2004 IEEE Computer Vision and Pattern Recognition Workshop (CVPRW). IEEE, 178\u2013178."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02366"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i5.32534"},{"key":"e_1_3_3_1_22_2","first-page":"17612","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Liang Weixin","year":"2022","unstructured":"Weixin Liang, Yuhui Zhang, Yongchan Kwon, Serena Ye, and James Zou. 2022. Mind the Gap: Understanding the Modality Gap in Multi-modal Contrastive Representation Learning. In Advances in Neural Information Processing Systems (NeurIPS). 17612\u201317625."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5347"},{"key":"e_1_3_3_1_24_2","unstructured":"Subhransu Maji Esa Rahtu Juho Kannala Matthew Blaschko and Andrea Vedaldi. 2013. Fine-Grained Visual Classification of Aircraft. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1306.5151 (2013)."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Vardan Papyan X.\u00a0Y. Han and David\u00a0L. Donoho. 2020. Prevalence of Neural Collapse during the Terminal Phase of Deep Learning Training. Proceedings of the National Academy of Sciences (PNAS) 117 40 (2020) 24652\u201324663.","DOI":"10.1073\/pnas.2015509117"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00409"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"e_1_3_3_1_30_2","unstructured":"Longtian Qiu Renrui Zhang Ziyu Guo Ziyao Zeng Yafeng Li and Guangnan Zhang. 2021. VT-CLIP: Enhancing Vision-Language Models with Visual-guided Texts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2112.02399 (2021)."},{"key":"e_1_3_3_1_31_2","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning (ICML)","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (ICML). PMLR, 8748\u20138763."},{"key":"e_1_3_3_1_32_2","first-page":"8821","volume-title":"Proceedings of the 38th International Conference on Machine Learning (ICML)","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-Shot Text-to-Image Generation. In Proceedings of the 38th International Conference on Machine Learning (ICML). 8821\u20138831."},{"key":"e_1_3_3_1_33_2","first-page":"5389","volume-title":"Proceedings of the 36th International Conference on Machine Learning (ICML)","author":"Recht Benjamin","year":"2019","unstructured":"Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, and Vaishaal Shankar. 2019. Do ImageNet Classifiers Generalize to ImageNet?. In Proceedings of the 36th International Conference on Machine Learning (ICML). PMLR, 5389\u20135400."},{"key":"e_1_3_3_1_34_2","unstructured":"Khurram Soomro Amir\u00a0Roshan Zamir and Mubarak Shah. 2012. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1212.0402 (2012)."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02201"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_16"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00257"},{"key":"e_1_3_3_1_38_2","unstructured":"Huy\u00a0V. Vo Oriane Sim\u00e9oni Cijo Jose Vasil Khalidov et\u00a0al. 2025. DINOv3: Scaling Self-Supervised Vision Models to Large Datasets and Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.10104 (2025)."},{"key":"e_1_3_3_1_39_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Wang Haohan","year":"2019","unstructured":"Haohan Wang, Robert Geirhos, Simon Kornblith, Jason Miller, Matthias Bethge, Felix\u00a0A. Wichmann, and Antonio Torralba. 2019. Learning Robust Global Representations by Penalizing Confident Predictions. In Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_40_2","first-page":"9929","volume-title":"International conference on machine learning","author":"Wang Tongzhou","year":"2020","unstructured":"Tongzhou Wang and Phillip Isola. 2020. Understanding contrastive representation learning through alignment and uniformity on the hypersphere. In International conference on machine learning. PMLR, 9929\u20139939."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00541"},{"key":"e_1_3_3_1_43_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Yuksekgonul Mert","year":"2023","unstructured":"Mert Yuksekgonul, Federico Bianchi, Pratyusha Kalluri, Dan Jurafsky, and James Zou. 2023. When and why Vision-Language Models behave like Bags-of-Words, and what to do about it?. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01460"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Kaiyang Zhou Jingkang Yang Chen\u00a0Change Loy and Ziwei Liu. 2022. Learning to Prompt for Vision-Language Models. International Journal of Computer Vision (IJCV) 130 9 (2022) 2337\u20132348.","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:51:52Z","timestamp":1781538712000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810841"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":46,"alternative-id":["10.1145\/3805622.3810841","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810841","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}