{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T10:27:47Z","timestamp":1768818467228,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":90,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science and Technology Major Project, National Natural Science Foundation of China, Zhejiang Provincial Natural Science Foundation of China, Beijing Natural Science Foundation, Beihang World TOP University Cooperation Program, Key Research and Devolopment Program of Jiangsu Porvince","award":["2022ZD0115502, NO. 62122010, U23B2010, No.LDT23F02022F02, NO.L231011, BE2023016-3"],"award-info":[{"award-number":["2022ZD0115502, NO. 62122010, U23B2010, No.LDT23F02022F02, NO.L231011, BE2023016-3"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681026","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"4928-4937","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Collaborative Training of Tiny-Large Vision Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1272-8723","authenticated-orcid":false,"given":"Shichen","family":"Lu","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4340-4000","authenticated-orcid":false,"given":"Longteng","family":"Guo","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9022-1370","authenticated-orcid":false,"given":"Wenxuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7781-932X","authenticated-orcid":false,"given":"Zijia","family":"Zhao","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5774-4084","authenticated-orcid":false,"given":"Tongtian","family":"Yue","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0903-9131","authenticated-orcid":false,"given":"Jing","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9180-2935","authenticated-orcid":false,"given":"Si","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Armen Aghajanyan Bernie Huang Candace Ross Vladimir Karpukhin Hu Xu Naman Goyal Dmytro Okhonko Mandar Joshi Gargi Ghosh Mike Lewis et al. 2022. Cm3: A causal masked multimodal model of the internet. arXiv preprint arXiv:2201.07520 (2022)."},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716--23736."},{"key":"e_1_3_2_1_3_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_1_5_1","unstructured":"Rohan Bavishi Erich Elsen Curtis Hawthorne Maxwell Nye Augustus Odena Arushi Somani and Saugnak Tacsirlar. 2023. Introducing our Multimodal Models. https:\/\/www.adept.ai\/blog\/fuyu-8b"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"e_1_3_2_1_7_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"International Conference on Artificial Intelligence and Statistics. PMLR, 661--669","author":"Chen Liqun","year":"2018","unstructured":"Liqun Chen, Shuyang Dai, Yunchen Pu, Erjin Zhou, Chunyuan Li, Qinliang Su, Changyou Chen, and Lawrence Carin. 2018. Symmetric variational autoencoder and connections to adversarial learning. In International Conference on Artificial Intelligence and Statistics. PMLR, 661--669."},{"key":"e_1_3_2_1_9_1","volume-title":"Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794","author":"Chen Xi","year":"2022","unstructured":"Xi Chen, Xiao Wang, Soravit Changpinyo, AJ Piergiovanni, Piotr Padlewski, Daniel Salz, Sebastian Goodman, Adam Grycner, Basil Mustafa, Lucas Beyer, et al. 2022. Pali: A jointly-scaled multilingual language-image model. 
arXiv preprint arXiv:2209.06794 (2022)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_11_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_2_1_12_1","volume-title":"X-lxmert: Paint, caption and answer questions with multi-modal transformers. arXiv preprint arXiv:2009.11278","author":"Cho Jaemin","year":"2020","unstructured":"Jaemin Cho, Jiasen Lu, Dustin Schwenk, Hannaneh Hajishirzi, and Aniruddha Kembhavi. 2020. X-lxmert: Paint, caption and answer questions with multi-modal transformers. arXiv preprint arXiv:2009.11278 (2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv preprint arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arxiv: 2305.06500 [cs.CV]"},{"key":"e_1_3_2_1_15_1","volume-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Jinrui Yang, Xiawu Zheng, Ke Li, Xing Sun, et al. 2023. Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01453-z"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Yash Goyal Tejas Khot Douglas Summers-Stay Dhruv Batra and Devi Parikh. 2017. Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In CVPR.","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_18_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Gu Yuxian","year":"2023","unstructured":"Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. 2023. MiniLLM: Knowledge distillation of large language models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-023-00003-0"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. 
arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27999"},{"key":"e_1_3_2_1_23_1","volume-title":"Gqa: A new dataset for real-world visual reasoning and compositional question answering. In CVPR.","author":"Hudson Drew A","year":"2019","unstructured":"Drew A Hudson and Christopher D Manning. 2019. Gqa: A new dataset for real-world visual reasoning and compositional question answering. In CVPR."},{"key":"e_1_3_2_1_24_1","volume-title":"Scheduled sampling, likelihood, adversary? arXiv preprint arXiv:1511.05101","author":"Husz\u00e1r Ferenc","year":"2015","unstructured":"Ferenc Husz\u00e1r. 2015. How (not) to train your generative model: Scheduled sampling, likelihood, adversary? arXiv preprint arXiv:1511.05101 (2015)."},{"key":"e_1_3_2_1_25_1","unstructured":"Gabriel Ilharco Mitchell Wortsman Ross Wightman Cade Gordon Nicholas Carlini Rohan Taori Achal Dave Vaishaal Shankar Hongseok Namkoong John Miller et al. 2021. OpenCLIP. (2021)."},{"key":"e_1_3_2_1_26_1","unstructured":"Haozhe Ji Pei Ke Zhipeng Hu Rongsheng Zhang and Minlie Huang. 2023. Tailoring Language Generation Models under Total Variation Distance. arxiv: 2302.13344 [cs.CL]"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-024-00040-3"},{"key":"e_1_3_2_1_28_1","volume-title":"Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351","author":"Jiao Xiaoqi","year":"2019","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2019. Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351 (2019)."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_30_1","volume-title":"Sequence-level knowledge distillation. arXiv preprint arXiv:1606.07947","author":"Kim Yoon","year":"2016","unstructured":"Yoon Kim and Alexander M Rush. 2016. Sequence-level knowledge distillation. arXiv preprint arXiv:1606.07947 (2016)."},{"key":"e_1_3_2_1_31_1","volume-title":"Lisa: Reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692","author":"Lai Xin","year":"2023","unstructured":"Xin Lai, Zhuotao Tian, Yukang Chen, Yanwei Li, Yuhui Yuan, Shu Liu, and Jiaya Jia. 2023. Lisa: Reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692 (2023)."},{"key":"e_1_3_2_1_32_1","unstructured":"Hugo Lauren\u00e7on Daniel van Strien Stas Bekman L\u00e9o Tronchon Lucile Saulnier Thomas Wang Siddharth Karamcheti Amanpreet Singh Giada Pistilli Yacine Jernite et al. 2023. Introducing idefics: An open reproduction of state-of-the-art visual language model, 2023. URL https:\/\/huggingface.co\/blog\/idefics. Accessed 2023-09-18."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103720"},{"key":"e_1_3_2_1_34_1","volume-title":"2023d. Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125","author":"Li Bohao","year":"2023","unstructured":"Bohao Li, Rui Wang, Guangzhi Wang, Yuying Ge, Yixiao Ge, and Ying Shan. 2023d. 
Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_36_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arxiv: 2301.12597 [cs.CV]"},{"key":"e_1_3_2_1_37_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML."},{"key":"e_1_3_2_1_38_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_39_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023. Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_1_40_1","volume-title":"Mixkd: Towards efficient distillation of large-scale language models. arXiv preprint arXiv:2011.00593","author":"Liang Kevin J","year":"2020","unstructured":"Kevin J Liang, Weituo Hao, Dinghan Shen, Yufan Zhou, Weizhu Chen, Changyou Chen, and Lawrence Carin. 2020. Mixkd: Towards efficient distillation of large-scale language models. arXiv preprint arXiv:2011.00593 (2020)."},{"key":"e_1_3_2_1_41_1","volume-title":"Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning."},{"key":"e_1_3_2_1_43_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Mmbench: Is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281","author":"Liu Yuan","year":"2023","unstructured":"Yuan Liu, Haodong Duan, Yuanhan Zhang, Bo Li, Songyang Zhang, Wangbo Zhao, Yike Yuan, Jiaqi Wang, Conghui He, Ziwei Liu, et al. 2023. 
Mmbench: Is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_46_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. arxiv: 1711.05101 [cs.LG]"},{"key":"e_1_3_2_1_47_1","volume-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering. NeurIPS","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tanglin Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to explain: Multimodal reasoning via thought chains for science question answering. NeurIPS (2022)."},{"key":"e_1_3_2_1_48_1","volume-title":"f-gan: Training generative neural samplers using variational divergence minimization. Advances in neural information processing systems","author":"Nowozin Sebastian","year":"2016","unstructured":"Sebastian Nowozin, Botond Cseke, and Ryota Tomioka. 2016. f-gan: Training generative neural samplers using variational divergence minimization. Advances in neural information processing systems, Vol. 29 (2016)."},{"key":"e_1_3_2_1_49_1","unstructured":"OpenAI. 2023. ChatGPT. Technical Report. https:\/\/openai.com\/blog\/chatgpt."},{"key":"e_1_3_2_1_50_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_51_1","volume-title":"Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_53_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_54_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_55_1","unstructured":"Aditya Ramesh Mikhail Pavlov Gabriel Goh Scott Gray Chelsea Voss Alec Radford Mark Chen and Ilya Sutskever. 2021. Zero-Shot Text-to-Image Generation. 
arxiv: 2102.12092 [cs.CV]"},{"key":"e_1_3_2_1_56_1","volume-title":"Caglar Gulcehre, Guillaume Desjardins, James Kirkpatrick, Razvan Pascanu, Volodymyr Mnih, Koray Kavukcuoglu, and Raia Hadsell.","author":"Rusu Andrei A","year":"2015","unstructured":"Andrei A Rusu, Sergio Gomez Colmenarejo, Caglar Gulcehre, Guillaume Desjardins, James Kirkpatrick, Razvan Pascanu, Volodymyr Mnih, Koray Kavukcuoglu, and Raia Hadsell. 2015. Policy distillation. arXiv preprint arXiv:1511.06295 (2015)."},{"key":"e_1_3_2_1_57_1","volume-title":"Sanghun Cho and Woonhyuk Baek","author":"Doyup Lee Saehoon Kim Chiheon Kim","year":"2021","unstructured":"Saehoon Kim, Sanghun Cho, Chiheon Kim, Doyup Lee, and Woonhyuk Baek. 2021. minDALL-E on Conceptual Captions. https:\/\/github.com\/kakaobrain\/minDALL-E."},{"key":"e_1_3_2_1_58_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_1_59_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_61_1","volume-title":"Emojich - zero-shot emoji generation using Russian language: a technical report. CoRR","author":"Shonenkov Alex","year":"2021","unstructured":"Alex Shonenkov, Daria Bakshandaeva, Denis Dimitrov, and Aleksandr Nikolich. 2021. Emojich - zero-shot emoji generation using Russian language: a technical report. CoRR, Vol. abs\/2112.02448 (2021). https:\/\/arxiv.org\/abs\/2112.02448"},{"key":"e_1_3_2_1_62_1","volume-title":"Meet Shah, Yu Jiang, Xinlei Chen, Devi Parikh, and Marcus Rohrbach.","author":"Singh Amanpreet","year":"2019","unstructured":"Amanpreet Singh, Vivek Natarajan, Meet Shah, Yu Jiang, Xinlei Chen, Devi Parikh, and Marcus Rohrbach. 2019. Towards VQA Models That Can Read. In CVPR."},{"key":"e_1_3_2_1_63_1","volume-title":"LightPAFF: A two-stage distillation framework for pre-training and fine-tuning. arXiv preprint arXiv:2004.12817","author":"Song Kaitao","year":"2020","unstructured":"Kaitao Song, Hao Sun, Xu Tan, Tao Qin, Jianfeng Lu, Hongzhi Liu, and Tie-Yan Liu. 2020. LightPAFF: A two-stage distillation framework for pre-training and fine-tuning. arXiv preprint arXiv:2004.12817 (2020)."},{"key":"e_1_3_2_1_64_1","volume-title":"Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530","author":"Su Weijie","year":"2019","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2019. Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)."},{"key":"e_1_3_2_1_65_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. 
Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_66_1","volume-title":"Patient knowledge distillation for bert model compression. arXiv preprint arXiv:1908.09355","author":"Sun Siqi","year":"2019","unstructured":"Siqi Sun, Yu Cheng, Zhe Gan, and Jingjing Liu. 2019. Patient knowledge distillation for bert model compression. arXiv preprint arXiv:1908.09355 (2019)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3413599"},{"key":"e_1_3_2_1_69_1","volume-title":"Hashimoto","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_2_1_70_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_71_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_72_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_73_1","volume-title":"Minilmv2: Multi-head self-attention relation distillation for compressing pretrained transformers. arXiv preprint arXiv:2012.15828","author":"Wang Wenhui","year":"2020","unstructured":"Wenhui Wang, Hangbo Bao, Shaohan Huang, Li Dong, and Furu Wei. 2020. Minilmv2: Multi-head self-attention relation distillation for compressing pretrained transformers. arXiv preprint arXiv:2012.15828 (2020)."},{"key":"e_1_3_2_1_74_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Wang Wenhai","year":"2024","unstructured":"Wenhai Wang, Zhe Chen, Xiaokang Chen, Jiannan Wu, Xizhou Zhu, Gang Zeng, Ping Luo, Tong Lu, Jie Zhou, Yu Qiao, et al. 2024. Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"e_1_3_2_1_76_1","first-page":"5776","article-title":"Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","volume":"33","author":"Wang Wenhui","year":"2020","unstructured":"Wenhui Wang, Furu Wei, Li Dong, Hangbo Bao, Nan Yang, and Ming Zhou. 2020. 
Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. Advances in Neural Information Processing Systems, Vol. 33 (2020), 5776--5788.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01235"},{"key":"e_1_3_2_1_78_1","volume-title":"SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. In International Conference on Learning Representations.","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021. SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_79_1","volume-title":"The dawn of lmms: Preliminary explorations with gpt-4v(ision). arXiv preprint arXiv:2309.17421","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023. The dawn of lmms: Preliminary explorations with gpt-4v(ision). arXiv preprint arXiv:2309.17421, Vol. 9, 1 (2023), 1."},{"key":"e_1_3_2_1_80_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_81_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_82_1","volume-title":"Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al.","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Yuanzhong Xu, Jing Yu Koh, Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al. 2022. Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789, Vol. 2, 3 (2022), 5."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01242"},{"key":"e_1_3_2_1_84_1","volume-title":"Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. arXiv preprint arXiv:2311.16502","author":"Yue Xiang","year":"2023","unstructured":"Xiang Yue, Yuansheng Ni, Kai Zhang, Tianyu Zheng, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu Jiang, Weiming Ren, Yuxuan Sun, et al. 2023. Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. arXiv preprint arXiv:2311.16502 (2023)."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-024-00051-0"},{"key":"e_1_3_2_1_86_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. 
arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_1_87_1","unstructured":"Pan Zhang Xiaoyi Dong Bin Wang Yuhang Cao Chao Xu Linke Ouyang Zhiyuan Zhao Shuangrui Ding Songyang Zhang Haodong Duan Hang Yan et al. 2023d. Internlm-xcomposer: A vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112 (2023)."},{"key":"e_1_3_2_1_88_1","volume-title":"Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199","author":"Zhang Renrui","year":"2023","unstructured":"Renrui Zhang, Jiaming Han, Chris Liu, Peng Gao, Aojun Zhou, Xiangfei Hu, Shilin Yan, Pan Lu, Hongsheng Li, and Yu Qiao. 2023. Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)."},{"key":"e_1_3_2_1_89_1","volume-title":"Do not blindly imitate the teacher: Using perturbed loss for knowledge distillation. arXiv preprint arXiv:2305.05010","author":"Zhang Rongzhi","year":"2023","unstructured":"Rongzhi Zhang, Jiaming Shen, Tianqi Liu, Jialu Liu, Michael Bendersky, Marc Najork, and Chao Zhang. 2023. Do not blindly imitate the teacher: Using perturbed loss for knowledge distillation. arXiv preprint arXiv:2305.05010 (2023)."},{"key":"e_1_3_2_1_90_1","volume-title":"Xi Victoria Lin, et al","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. 2022. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681026","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681026","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:37Z","timestamp":1750295857000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681026"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":90,"alternative-id":["10.1145\/3664647.3681026","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681026","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}