{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:09:34Z","timestamp":1750219774236,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592266","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"57-66","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Improving Image Encoders for General-Purpose Nearest Neighbor Search and Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3548-0537","authenticated-orcid":false,"given":"Konstantin","family":"Schall","sequence":"first","affiliation":[{"name":"HTW Berlin, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6309-572X","authenticated-orcid":false,"given":"Kai Uwe","family":"Barthel","sequence":"additional","affiliation":[{"name":"HTW Berlin, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3957-4672","authenticated-orcid":false,"given":"Nico","family":"Hezel","sequence":"additional","affiliation":[{"name":"HTW Berlin, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3600-6848","authenticated-orcid":false,"given":"Klaus","family":"Jung","sequence":"additional","affiliation":[{"name":"HTW Berlin, Germany"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"CVPR 2020 AliProducts Challenge: Large-scale product recognition. https:\/\/tianchi.aliyun.com\/competition\/entrance\/231780\/information","author":"Alibaba Tianchi","year":"2020","unstructured":"Tianchi Alibaba. 2020. CVPR 2020 AliProducts Challenge: Large-scale product recognition. https:\/\/tianchi.aliyun.com\/competition\/entrance\/231780\/information"},{"unstructured":"Andre Araujo and Bingyi Cao. 2022. Google Universal Image Embedding. https:\/\/kaggle.com\/competitions\/google-universal-image-embedding","key":"e_1_3_2_1_2_1"},{"doi-asserted-by":"crossref","unstructured":"Artem Babenko Anton Slesarev Alexander Chigorin and Victor\u00a0S. Lempitsky. 2014. Neural Codes for Image Retrieval.. In ECCV (1).","key":"e_1_3_2_1_3_1","DOI":"10.1007\/978-3-319-10590-1_38"},{"doi-asserted-by":"publisher","unstructured":"Yalong Bai Yuxiang Chen Wei Yu Linfang Wang and Wei Zhang. 2020. Products-10K: A Large-scale Product Recognition Dataset. https:\/\/doi.org\/10.48550\/ARXIV.2008.10545","key":"e_1_3_2_1_4_1","DOI":"10.48550\/ARXIV.2008.10545"},{"key":"e_1_3_2_1_5_1","volume-title":"Language Models are Few-Shot Learners. CoRR abs\/2005.14165","author":"Brown B.","year":"2020","unstructured":"Tom\u00a0B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel\u00a0M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. CoRR abs\/2005.14165 (2020). arXiv:2005.14165https:\/\/arxiv.org\/abs\/2005.14165"},{"unstructured":"Matic Broz. 2023. How Many Photos Are There? (2023) 50+ Photos Statistics \u2014 photutorial.com. https:\/\/photutorial.com\/photos-statistics. [Accessed 20-Jan-2023].","key":"e_1_3_2_1_6_1"},{"doi-asserted-by":"crossref","unstructured":"Bingyi Cao Andr\u00e9 Araujo and Jack Sim. 2020. Unifying Deep Local and Global Features for Image Search.. In ECCV (20) Andrea Vedaldi Horst Bischof Thomas Brox and Jan-Michael Frahm (Eds.).","key":"e_1_3_2_1_7_1","DOI":"10.1007\/978-3-030-58565-5_43"},{"doi-asserted-by":"publisher","unstructured":"Mathilde Caron Ishan Misra Julien Mairal Priya Goyal Piotr Bojanowski and Armand Joulin. 2020. Unsupervised Learning of Visual Features by Contrasting Cluster Assignments. https:\/\/doi.org\/10.48550\/ARXIV.2006.09882","key":"e_1_3_2_1_8_1","DOI":"10.48550\/ARXIV.2006.09882"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1111\/j.2517-6161.1958.tb00292.x"},{"doi-asserted-by":"crossref","unstructured":"Jiankang Deng Jia Guo Niannan Xue and Stefanos Zafeiriou. 2019. ArcFace: Additive Angular Margin Loss for Deep Face Recognition.. In CVPR.","key":"e_1_3_2_1_10_1","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_2_1_11_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805 (2018). http:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_12_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale.CoRR","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale.CoRR (2020)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.1109\/TIFS.2014.2359646"},{"key":"e_1_3_2_1_14_1","volume-title":"Training Vision Transformers for Image Retrieval. CoRR abs\/2102.05644","author":"El-Nouby Alaaeldin","year":"2021","unstructured":"Alaaeldin El-Nouby, Natalia Neverova, Ivan Laptev, and Herv\u00e9 J\u00e9gou. 2021. Training Vision Transformers for Image Retrieval. CoRR abs\/2102.05644 (2021). arXiv:2102.05644https:\/\/arxiv.org\/abs\/2102.05644"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.48550\/ARXIV.2211.07636"},{"doi-asserted-by":"crossref","unstructured":"Albert Gordo Jon Almaz\u00e1n J\u00e9r\u00f4me Revaud and Diane Larlus. 2016. Deep Image Retrieval: Learning Global Representations for Image Search.. In ECCV (6).","key":"e_1_3_2_1_16_1","DOI":"10.1007\/978-3-319-46466-4_15"},{"unstructured":"Priya Goyal Quentin Duval Isaac Seessel Mathilde Caron Ishan Misra Levent Sagun Armand Joulin and Piotr Bojanowski. 2022. Vision Models Are More Robust And Fair When Pretrained On Uncurated Images Without Supervision. (2022). arxiv:2202.08360\u00a0[cs.CV]","key":"e_1_3_2_1_17_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_18_1","DOI":"10.1109\/CVPR52688.2022.01553"},{"volume-title":"Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"He K.","unstructured":"K. He, X. Zhang, S. Ren, and J. Sun. 2016. Deep Residual Learning for Image Recognition. In 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","key":"e_1_3_2_1_19_1"},{"doi-asserted-by":"publisher","unstructured":"Dan Hendrycks Steven Basart Norman Mu Saurav Kadavath Frank Wang Evan Dorundo Rahul Desai Tyler Zhu Samyak Parajuli Mike Guo Dawn Song Jacob Steinhardt and Justin Gilmer. 2020. The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization. https:\/\/doi.org\/10.48550\/ARXIV.2006.16241","key":"e_1_3_2_1_20_1","DOI":"10.48550\/ARXIV.2006.16241"},{"doi-asserted-by":"publisher","unstructured":"Dan Hendrycks Kevin Zhao Steven Basart Jacob Steinhardt and Dawn Song. 2019. Natural Adversarial Examples. https:\/\/doi.org\/10.48550\/ARXIV.1907.07174","key":"e_1_3_2_1_21_1","DOI":"10.48550\/ARXIV.1907.07174"},{"doi-asserted-by":"crossref","unstructured":"Grant\u00a0Van Horn Oisin\u00a0Mac Aodha Yang Song Yin Cui Chen Sun Alexander Shepard Hartwig Adam Pietro Perona and Serge\u00a0J. Belongie. 2018. The INaturalist Species Classification and Detection Dataset.. In CVPR.","key":"e_1_3_2_1_22_1","DOI":"10.1109\/CVPR.2018.00914"},{"key":"e_1_3_2_1_23_1","volume-title":"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. CoRR abs\/2102.05918","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc\u00a0V. Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. CoRR abs\/2102.05918 (2021). arXiv:2102.05918https:\/\/arxiv.org\/abs\/2102.05918"},{"key":"e_1_3_2_1_24_1","volume-title":"Proxy Anchor Loss for Deep Metric Learning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Kim Sungyeon","year":"2020","unstructured":"Sungyeon Kim, Dongwon Kim, Minsu Cho, and Suha Kwak. 2020. Proxy Anchor Loss for Deep Metric Learning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"doi-asserted-by":"crossref","unstructured":"Alexander Kolesnikov Lucas Beyer Xiaohua Zhai Joan Puigcerver Jessica Yung Sylvain Gelly and Neil Houlsby. 2020. Big Transfer (BiT): General Visual Representation Learning.. In ECCV (5).","key":"e_1_3_2_1_25_1","DOI":"10.1007\/978-3-030-58558-7_29"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_26_1","DOI":"10.1109\/ICCVW.2013.77"},{"doi-asserted-by":"crossref","unstructured":"Liunian\u00a0Harold Li Pengchuan Zhang Haotian Zhang* Jianwei Yang Chunyuan Li Yiwu Zhong Lijuan Wang Lu Yuan Lei Zhang Jenq-Neng Hwang Kai-Wei Chang and Jianfeng Gao. 2022. Grounded Language-Image Pre-training. In CVPR.","key":"e_1_3_2_1_27_1","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_28_1","volume-title":"Piotr Doll\u2019a r, and C.\u00a0Lawrence Zitnick","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge\u00a0J. Belongie, Lubomir\u00a0D. Bourdev, Ross\u00a0B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u2019a r, and C.\u00a0Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. CoRR abs\/1405.0312 (2014). arxiv:1405.0312http:\/\/arxiv.org\/abs\/1405.0312"},{"key":"e_1_3_2_1_29_1","volume-title":"Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. arXiv preprint arXiv:2103.14030","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo. 2021. Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. arXiv preprint arXiv:2103.14030 (2021)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_30_1","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"unstructured":"S. Maji J. Kannala E. Rahtu M. Blaschko and A. Vedaldi. 2013. Fine-Grained Visual Classification of Aircraft. Technical Report. arxiv:1306.5151\u00a0[cs-cv]","key":"e_1_3_2_1_31_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_32_1","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_33_1","volume-title":"Deep Face Recognition. In British Machine Vision Conference.","author":"Parkhi M.","year":"2015","unstructured":"Omkar\u00a0M. Parkhi, Andrea Vedaldi, and Andrew Zisserman. 2015. Deep Face Recognition. In British Machine Vision Conference."},{"doi-asserted-by":"crossref","unstructured":"James Philbin Ondrej Chum Michael Isard Josef Sivic and Andrew Zisserman. 2007. Object retrieval with large vocabularies and fast spatial matching.. In CVPR.","key":"e_1_3_2_1_34_1","DOI":"10.1109\/CVPR.2007.383172"},{"volume-title":"Revisiting Oxford and","author":"Radenovic Filip","unstructured":"Filip Radenovic, Ahmet Iscen, Giorgos Tolias, Yannis Avrithis, and Ondrej Chum. 2018. Revisiting Oxford and Paris: Large-Scale Image Retrieval Benchmarking.. In CVPR.","key":"e_1_3_2_1_35_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_36_1","DOI":"10.1109\/TPAMI.2018.2846566"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748\u20138763. http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"doi-asserted-by":"publisher","unstructured":"Ilija Radosavovic Raj\u00a0Prateek Kosaraju Ross Girshick Kaiming He and Piotr Doll\u00e1r. 2020. Designing Network Design Spaces. https:\/\/doi.org\/10.48550\/ARXIV.2003.13678","key":"e_1_3_2_1_38_1","DOI":"10.48550\/ARXIV.2003.13678"},{"doi-asserted-by":"publisher","unstructured":"Benjamin Recht Rebecca Roelofs Ludwig Schmidt and Vaishaal Shankar. 2019. Do ImageNet Classifiers Generalize to ImageNet?https:\/\/doi.org\/10.48550\/ARXIV.1902.10811","key":"e_1_3_2_1_39_1","DOI":"10.48550\/ARXIV.1902.10811"},{"doi-asserted-by":"publisher","unstructured":"Tal Ridnik Emanuel Ben-Baruch Asaf Noy and Lihi Zelnik-Manor. 2021. ImageNet-21K Pretraining for the Masses. https:\/\/doi.org\/10.48550\/ARXIV.2104.10972","key":"e_1_3_2_1_40_1","DOI":"10.48550\/ARXIV.2104.10972"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_41_1","DOI":"10.1007\/s11263-015-0816-y"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_42_1","DOI":"10.1007\/978-3-030-98358-1_17"},{"doi-asserted-by":"publisher","unstructured":"Christoph Schuhmann Romain Beaumont Richard Vencu Cade Gordon Ross Wightman Mehdi Cherti Theo Coombes Aarush Katta Clayton Mullis Mitchell Wortsman Patrick Schramowski Srivatsa Kundurthy Katherine Crowson Ludwig Schmidt Robert Kaczmarczyk and Jenia Jitsev. 2022. LAION-5B: An open large-scale dataset for training next generation image-text models. https:\/\/doi.org\/10.48550\/ARXIV.2210.08402","key":"e_1_3_2_1_43_1","DOI":"10.48550\/ARXIV.2210.08402"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_44_1","DOI":"10.48550\/ARXIV.2210.08473"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_45_1","DOI":"10.1109\/CVPR.2016.434"},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning.","author":"Tan Mingxing","year":"2021","unstructured":"Mingxing Tan and Quoc Le. 2021. EfficientNetV2: Smaller Models and Faster Training. In Proceedings of the 38th International Conference on Machine Learning."},{"unstructured":"Giorgos Tolias Ronan Sicre and Herv\u00e9 J\u00e9gou. 2016. Particular object retrieval with integral max-pooling of CNN activations.. In ICLR (Poster).","key":"e_1_3_2_1_47_1"},{"volume-title":"Advances in Neural Information Processing Systems 30. Curran Associates","author":"Vaswani Ashish","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N. Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30. Curran Associates, Inc.","key":"e_1_3_2_1_48_1"},{"key":"e_1_3_2_1_49_1","volume-title":"Technical Report CNS-TR-2011-001. California Institute of Technology.","author":"Wah C.","year":"2011","unstructured":"C. Wah, S. Branson, P. Welinder, P. Perona, and S. Belongie. 2011. The Caltech-UCSD Birds-200-2011 Dataset. Technical Report CNS-TR-2011-001. California Institute of Technology."},{"unstructured":"Haohan Wang Songwei Ge Zachary Lipton and Eric\u00a0P Xing. 2019. Learning Robust Global Representations by Penalizing Local Predictive Power. In Advances in Neural Information Processing Systems. 10506\u201310518.","key":"e_1_3_2_1_50_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_51_1","DOI":"10.1145\/2700292"},{"doi-asserted-by":"crossref","unstructured":"Tobias Weyand Andr\u00e9 Araujo Bingyi Cao and Jack Sim. 2020. Google Landmarks Dataset v2 - A Large-Scale Benchmark for Instance-Level Recognition and Retrieval.. In CVPR.","key":"e_1_3_2_1_52_1","DOI":"10.1109\/CVPR42600.2020.00265"},{"doi-asserted-by":"publisher","unstructured":"Ross Wightman. 2019. PyTorch Image Models. https:\/\/github.com\/rwightman\/pytorch-image-models. https:\/\/doi.org\/10.5281\/zenodo.4414861","key":"e_1_3_2_1_53_1","DOI":"10.5281\/zenodo.4414861"},{"doi-asserted-by":"publisher","unstructured":"Thomas Wolf Lysandre Debut Victor Sanh Julien Chaumond Clement Delangue Anthony Moi Pierric Cistac Tim Rault R\u00e9mi Louf Morgan Funtowicz Joe Davison Sam Shleifer Patrick von Platen Clara Ma Yacine Jernite Julien Plu Canwen Xu Teven\u00a0Le Scao Sylvain Gugger Mariama Drame Quentin Lhoest and Alexander\u00a0M. Rush. 2019. HuggingFace\u2019s Transformers: State-of-the-art Natural Language Processing. https:\/\/doi.org\/10.48550\/ARXIV.1910.03771","key":"e_1_3_2_1_54_1","DOI":"10.48550\/ARXIV.1910.03771"},{"doi-asserted-by":"publisher","unstructured":"Sanghyun Woo Shoubhik Debnath Ronghang Hu Xinlei Chen Zhuang Liu In\u00a0So Kweon and Saining Xie. 2023. ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders. https:\/\/doi.org\/10.48550\/ARXIV.2301.00808","key":"e_1_3_2_1_55_1","DOI":"10.48550\/ARXIV.2301.00808"},{"doi-asserted-by":"publisher","unstructured":"Mitchell Wortsman Gabriel Ilharco Jong\u00a0Wook Kim Mike Li Simon Kornblith Rebecca Roelofs Raphael Gontijo-Lopes Hannaneh Hajishirzi Ali Farhadi Hongseok Namkoong and Ludwig Schmidt. 2021. Robust fine-tuning of zero-shot models. https:\/\/doi.org\/10.48550\/ARXIV.2109.01903","key":"e_1_3_2_1_56_1","DOI":"10.48550\/ARXIV.2109.01903"},{"doi-asserted-by":"crossref","unstructured":"Jianwei Yang Chunyuan Li Pengchuan Zhang Bin Xiao Ce Liu Lu Yuan and Jianfeng Gao. 2022. Unified Contrastive Learning in Image-Text-Label Space. arxiv:2204.03610\u00a0[cs.CV]","key":"e_1_3_2_1_57_1","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"e_1_3_2_1_58_1","volume-title":"DOLG: Single-Stage Image Retrieval with Deep Orthogonal Fusion of Local and Global Features. CoRR abs\/2108.02927","author":"Yang Min","year":"2021","unstructured":"Min Yang, Dongliang He, Miao Fan, Baorong Shi, Xuetong Xue, Fu Li, Errui Ding, and Jizhou Huang. 2021. DOLG: Single-Stage Image Retrieval with Deep Orthogonal Fusion of Local and Global Features. CoRR abs\/2108.02927 (2021). arXiv:2108.02927https:\/\/arxiv.org\/abs\/2108.02927"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_59_1","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_60_1","volume-title":"Scaling Vision Transformers. CoRR abs\/2106.04560","author":"Zhai Xiaohua","year":"2021","unstructured":"Xiaohua Zhai, Alexander Kolesnikov, Neil Houlsby, and Lucas Beyer. 2021. Scaling Vision Transformers. CoRR abs\/2106.04560 (2021). https:\/\/arxiv.org\/abs\/2106.04560"},{"doi-asserted-by":"publisher","unstructured":"Xiaohua Zhai Joan Puigcerver Alexander Kolesnikov Pierre Ruyssen Carlos Riquelme Mario Lucic Josip Djolonga Andre\u00a0Susano Pinto Maxim Neumann Alexey Dosovitskiy Lucas Beyer Olivier Bachem Michael Tschannen Marcin Michalski Olivier Bousquet Sylvain Gelly and Neil Houlsby. 2019. A Large-scale Study of Representation Learning with the Visual Task Adaptation Benchmark. https:\/\/doi.org\/10.48550\/ARXIV.1910.04867","key":"e_1_3_2_1_61_1","DOI":"10.48550\/ARXIV.1910.04867"},{"key":"e_1_3_2_1_62_1","volume-title":"GLIPv2: Unifying Localization and Vision-Language Understanding. arXiv preprint arXiv:2206.05836","author":"Zhang Haotian","year":"2022","unstructured":"Haotian Zhang, Pengchuan Zhang, Xiaowei Hu, Yen-Chun Chen, Liunian\u00a0Harold Li, Xiyang Dai, Lijuan Wang, Lu Yuan, Jenq-Neng Hwang, and Jianfeng Gao. 2022. GLIPv2: Unifying Localization and Vision-Language Understanding. arXiv preprint arXiv:2206.05836 (2022)."},{"key":"e_1_3_2_1_63_1","volume-title":"Bamboo: Building Mega-Scale Vision Dataset Continually with Human-Machine Synergy. arxiv:2203.07845\u00a0[cs.CV]","author":"Zhang Yuanhan","year":"2022","unstructured":"Yuanhan Zhang, Qinghong Sun, Yichun Zhou, Zexin He, Zhenfei Yin, Kun Wang, Lu Sheng, Yu Qiao, Jing Shao, and Ziwei Liu. 2022. Bamboo: Building Mega-Scale Vision Dataset Continually with Human-Machine Synergy. arxiv:2203.07845\u00a0[cs.CV]"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"ICMR '23","name":"ICMR '23: International Conference on Multimedia Retrieval","location":"Thessaloniki Greece"},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592266","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592266","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:30Z","timestamp":1750178250000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592266"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":63,"alternative-id":["10.1145\/3591106.3592266","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592266","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}