{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:49Z","timestamp":1765343089667,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758234","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"12897-12904","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards High Robust Vision-Language Large Models: Benchmark and Method"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7720-806X","authenticated-orcid":false,"given":"Minyi","family":"Zhao","sequence":"first","affiliation":[{"name":"College of CS and AI, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4815-1967","authenticated-orcid":false,"given":"Yi","family":"Liu","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8785-1771","authenticated-orcid":false,"given":"Wensong","family":"He","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6495-7894","authenticated-orcid":false,"given":"Bingzhe","family":"Yu","sequence":"additional","affiliation":[{"name":"College of CS and AI, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1006-6041","authenticated-orcid":false,"given":"Yuxi","family":"Mi","sequence":"additional","affiliation":[{"name":"College of CS and AI, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1949-2768","authenticated-orcid":false,"given":"Shuigeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"College of CS and AI, Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_3_1","volume-title":"Jailbreakbench: An open robustness benchmark for jailbreaking large language models. arXiv preprint arXiv:2404.01318","author":"Chao Patrick","year":"2024","unstructured":"Patrick Chao, Edoardo Debenedetti, Alexander Robey, Maksym Andriushchenko, Francesco Croce, Vikash Sehwag, Edgar Dobriban, Nicolas Flammarion, George J Pappas, Florian Tramer, et al., 2024. Jailbreakbench: An open robustness benchmark for jailbreaking large language models. 
arXiv preprint arXiv:2404.01318 (2024)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"e_1_3_2_1_5_1","volume-title":"Allava: Harnessing gpt4v-synthesized data for a lite vision-language model. arXiv preprint arXiv:2402.11684","author":"Chen Guiming Hardy","year":"2024","unstructured":"Guiming Hardy Chen, Shunian Chen, Ruifei Zhang, Junying Chen, Xiangbo Wu, Zhiyi Zhang, Zhihong Chen, Jianquan Li, Xiang Wan, and Benyou Wang. 2024a. Allava: Harnessing gpt4v-synthesized data for a lite vision-language model. arXiv preprint arXiv:2402.11684 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793","author":"Chen Lin","year":"2023","unstructured":"Lin Chen, Jisong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2023a. Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024b. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 24185-24198."},{"key":"e_1_3_2_1_8_1","volume-title":"InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=vvoWPYqZJA","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=vvoWPYqZJA"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00626-4"},{"key":"e_1_3_2_1_10_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Jinrui Yang, Xiawu Zheng, Ke Li, Xing Sun, et al., 2023. Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Yuhan Dai, Yongdong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, et al., 2024. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. 
arXiv preprint arXiv:2405.21075 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01363"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01046"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8322-8326","author":"Helff Lukas","year":"2024","unstructured":"Lukas Helff, Felix Friedrich, Manuel Brack, Patrick Schramowski, and Kristian Kersting. 2024. LLAVAGUARD: VLM-based Safeguard for Vision Dataset Curation and Safety Assessment. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 8322-8326."},{"key":"e_1_3_2_1_18_1","unstructured":"Alexander Hermans Lucas Beyer and Bastian Leibe. 2017. In defense of the triplet loss for person re-identification. arXiv preprint arXiv:1703.07737 (2017)."},{"key":"e_1_3_2_1_19_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"Ciem: Contrastive instruction evaluation method for better instruction tuning. arXiv preprint arXiv:2309.02301","author":"Hu Hongyu","year":"2023","unstructured":"Hongyu Hu, Jiyuan Zhang, Minyi Zhao, and Zhenbang Sun. 2023. Ciem: Contrastive instruction evaluation method for better instruction tuning. arXiv preprint arXiv:2309.02301 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_23_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al.","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al., 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125","author":"Li Bohao","year":"2023","unstructured":"Bohao Li, Rui Wang, Guangzhi Wang, Yuying Ge, Yixiao Ge, and Ying Shan. 2023d. Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"2023 e. Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Jingkang Yang, and Ziwei Liu. 2023 e. Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326","author":"Li Bo","year":"2024","unstructured":"Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Yanwei Li, Ziwei Liu, and Chunyuan Li. 2024. 
Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_29_1","volume-title":"Empowering vision-language models to follow interleaved vision-language instructions. arXiv preprint arXiv:2308.04152","author":"Li Juncheng","year":"2023","unstructured":"Juncheng Li, Kaihang Pan, Zhiqi Ge, Minghe Gao, Hanwang Zhang, Wei Ji, Wenqiao Zhang, Tat-Seng Chua, Siliang Tang, and Yueting Zhuang. 2023c. Empowering vision-language models to follow interleaved vision-language instructions. arXiv preprint arXiv:2308.04152 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023a. Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Benchmark evaluations, applications, and challenges of large vision language models: A survey. arXiv preprint arXiv:2501.02189","author":"Li Zongxia","year":"2025","unstructured":"Zongxia Li, Xiyang Wu, Hongyang Du, Huy Nghiem, and Guangyao Shi. 2025. Benchmark evaluations, applications, and challenges of large vision language models: A survey. arXiv preprint arXiv:2501.02189 (2025)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_33_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li Bo Li Yuanhan Zhang Sheng Shen and Yong Jae Lee. 2024b. LLaVA-NeXT: Improved reasoning OCR and world knowledge. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"e_1_3_2_1_34_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024c. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"e_1_3_2_1_36_1","volume-title":"Mmbench: Is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281","author":"Liu Yuan","year":"2023","unstructured":"Yuan Liu, Haodong Duan, Yuanhan Zhang, Bo Li, Songyang Zhang, Wangbo Zhao, Yike Yuan, Jiaqi Wang, Conghui He, Ziwei Liu, et al., 2023. Mmbench: Is your multi-modal model an all-around player? 
arXiv preprint arXiv:2307.06281 (2023)."},{"key":"e_1_3_2_1_37_1","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume":"35","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tanglin Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to explain: Multimodal reasoning via thought chains for science question answering. Advances in Neural Information Processing Systems, Vol. 35 (2022), 2507-2521.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_38_1","volume-title":"Jake Grigsby, Di Jin, and Yanjun Qi.","author":"Morris John X","year":"2020","unstructured":"John X Morris, Eli Lifland, Jin Yong Yoo, Jake Grigsby, Di Jin, and Yanjun Qi. 2020. Textattack: A framework for adversarial attacks, data augmentation, and adversarial training in nlp. arXiv preprint arXiv:2005.05909 (2020)."},{"key":"e_1_3_2_1_39_1","volume-title":"Instruction tuning with gpt-4. arXiv preprint arXiv:2304.03277","author":"Peng Baolin","year":"2023","unstructured":"Baolin Peng, Chunyuan Li, Pengcheng He, Michel Galley, and Jianfeng Gao. 2023. Instruction tuning with gpt-4. arXiv preprint arXiv:2304.03277 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399061"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Amanpreet Singh Vivek Natarajan Meet Shah Yu Jiang Xinlei Chen Dhruv Batra Devi Parikh and Marcus Rohrbach. 2019. Towards vqa models that can read. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 8317-8326.","DOI":"10.1109\/CVPR.2019.00851"},{"key":"e_1_3_2_1_42_1","volume-title":"A Cognitive Evaluation Benchmark of Image Reasoning and Description for Large Vision-Language Models. arXiv preprint arXiv:2402.18409","author":"Song Xiujie","year":"2024","unstructured":"Xiujie Song, Mengyue Wu, Kenny Q Zhu, Chunhao Zhang, and Yanyi Chen. 2024. A Cognitive Evaluation Benchmark of Image Reasoning and Description for Large Vision-Language Models. arXiv preprint arXiv:2402.18409 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"An llm-free multi-dimensional benchmark for mllms hallucination evaluation. arXiv preprint arXiv:2311.07397","author":"Wang Junyang","year":"2023","unstructured":"Junyang Wang, Yuhang Wang, Guohai Xu, Jing Zhang, Yukai Gu, Haitao Jia, Ming Yan, Ji Zhang, and Jitao Sang. 2023. An llm-free multi-dimensional benchmark for mllms hallucination evaluation. arXiv preprint arXiv:2311.07397 (2023)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00217"},{"key":"e_1_3_2_1_45_1","volume-title":"Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv preprint arXiv:1901.11196","author":"Wei Jason","year":"2019","unstructured":"Jason Wei and Kai Zou. 2019. Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv preprint arXiv:1901.11196 (2019)."},{"key":"e_1_3_2_1_46_1","first-page":"69925","article-title":"Visionllm v2: An end-to-end generalist multimodal large language model for hundreds of vision-language tasks","volume":"37","author":"Wu Jiannan","year":"2024","unstructured":"Jiannan Wu, Muyan Zhong, Sen Xing, Zeqiang Lai, Zhaoyang Liu, Zhe Chen, Wenhai Wang, Xizhou Zhu, Lewei Lu, Tong Lu, et al., 2024. 
Visionllm v2: An end-to-end generalist multimodal large language model for hundreds of vision-language tasks. Advances in Neural Information Processing Systems, Vol. 37 (2024), 69925-69975.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_47_1","volume-title":"Lvlm-ehub: A comprehensive evaluation benchmark for large vision-language models","author":"Xu Peng","year":"2024","unstructured":"Peng Xu, Wenqi Shao, Kaipeng Zhang, Peng Gao, Shuo Liu, Meng Lei, Fanqing Meng, Siyuan Huang, Yu Qiao, and Ping Luo. 2024. Lvlm-ehub: A comprehensive evaluation benchmark for large vision-language models. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_48_1","unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024. Qwen2. 5 technical report. arXiv preprint arXiv:2412.15115 (2024)."},{"key":"e_1_3_2_1_49_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490","author":"Yu Weihao","year":"2023","unstructured":"Weihao Yu, Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Zicheng Liu, Xinchao Wang, and Lijuan Wang. 2023. Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwx105"},{"key":"e_1_3_2_1_52_1","first-page":"35549","article-title":"Towards video text visual question answering: Benchmark and baseline","volume":"35","author":"Zhao Minyi","year":"2022","unstructured":"Minyi Zhao, Bingjia Li, Jie Wang, Wanqing Li, Wenjing Zhou, Lan Zhang, Shijie Xuyang, Zhihang Yu, Xinkun Yu, Guangze Li, et al., 2022a. Towards video text visual question answering: Benchmark and baseline. Advances in Neural Information Processing Systems, Vol. 35 (2022), 35549-35562.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_53_1","volume-title":"Effectively Enhancing Vision Language Large Models by Prompt Augmentation and Caption Utilization. arXiv preprint arXiv:2409.14484","author":"Zhao Minyi","year":"2024","unstructured":"Minyi Zhao, Jie Wang, Zhaoyang Li, Jiyuan Zhang, Zhenbang Sun, and Shuigeng Zhou. 2024. Effectively Enhancing Vision Language Large Models by Prompt Augmentation and Caption Utilization. arXiv preprint arXiv:2409.14484 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"EPiDA: An Easy Plug-in Data Augmentation Framework for High Performance Text Classification. arXiv preprint arXiv:2204.11205","author":"Zhao Minyi","year":"2022","unstructured":"Minyi Zhao, Lu Zhang, Yi Xu, Jiandong Ding, Jihong Guan, and Shuigeng Zhou. 2022b. EPiDA: An Easy Plug-in Data Augmentation Framework for High Performance Text Classification. arXiv preprint arXiv:2204.11205 (2022)."},{"key":"e_1_3_2_1_55_1","volume-title":"Ekin Dogus Cubuk, and Quoc Le","author":"Zoph Barret","year":"2020","unstructured":"Barret Zoph, Golnaz Ghiasi, Tsung-Yi Lin, Yin Cui, Hanxiao Liu, Ekin Dogus Cubuk, and Quoc Le. 2020. Rethinking pre-training and self-training. Advances in neural information processing systems, Vol. 
33 (2020), 3833-3845."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758234","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:00:33Z","timestamp":1765342833000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758234"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":55,"alternative-id":["10.1145\/3746027.3758234","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758234","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
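
The record above is shaped like a standard Crossref REST API "work" response (a top-level envelope with "status", "message-type", and the metadata under "message"). A minimal sketch of fetching and parsing such a record follows, assuming the public endpoint https://api.crossref.org/works/{doi} and the Python "requests" package; field names are taken from the record itself.

    # Minimal sketch: retrieve and parse a Crossref work record.
    # Assumption: the public Crossref REST API endpoint api.crossref.org/works/{doi}
    # returns a JSON envelope like the record shown above.
    import requests

    DOI = "10.1145/3746027.3758234"

    resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
    resp.raise_for_status()
    work = resp.json()["message"]  # metadata lives under the "message" key

    title = work["title"][0]
    authors = [f'{a.get("given", "")} {a.get("family", "")}'.strip()
               for a in work.get("author", [])]

    print(title)                                  # paper title
    print("; ".join(authors))                     # author list
    print(work["DOI"], "-", work["references-count"], "references")

Run against this DOI, the sketch would print the title "Towards High Robust Vision-Language Large Models: Benchmark and Method", the six authors, and the reference count of 55, matching the fields embedded in the record above.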