{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:19Z","timestamp":1750309519747,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T00:00:00Z","timestamp":1729814400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"name":"National Key R & D Program of China","award":["2021YFE0205700"],"award-info":[{"award-number":["2021YFE0205700"]}]},{"name":"The Science and Technology Reseaerch and Development Plan of China Railway","award":["P2023S001"],"award-info":[{"award-number":["P2023S001"]}]},{"name":"The National Natural Science Foundation of China","award":["62276260, 62076235, and 62206290"],"award-info":[{"award-number":["62276260, 62076235, and 62206290"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,25]]},"DOI":"10.1145\/3704323.3704364","type":"proceedings-article","created":{"date-parts":[[2025,1,7]],"date-time":"2025-01-07T08:25:22Z","timestamp":1736238322000},"page":"303-313","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Mamba: A Versatile Multimodal Model for Seamless Integration into Diverse Downstream Tasks"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-8517-9829","authenticated-orcid":false,"given":"Zongshu","family":"Li","sequence":"first","affiliation":[{"name":"Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing, Beijing, China and Algorithm Development Department, Wuhan Artificial Intelligence Research, Wuhan, Hubei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8293-3952","authenticated-orcid":false,"given":"Guibo","family":"Zhu","sequence":"additional","affiliation":[{"name":"Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4001-4453","authenticated-orcid":false,"given":"Dongyi","family":"Yi","sequence":"additional","affiliation":[{"name":"Algorithm Development Department, Wuhan Artificial Intelligence Research, Wuhan, Hubei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9118-2780","authenticated-orcid":false,"given":"Jinqiao","family":"Wang","sequence":"additional","affiliation":[{"name":"Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,1,7]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"[n. d.]. Introducing ChatGPT. https:\/\/openai.com\/index\/chatgpt\/"},{"key":"e_1_3_3_1_3_2","unstructured":"AI@Meta. 2024. Llama 3 Model Card. (2024). https:\/\/github.com\/meta-llama\/llama3\/blob\/main\/MODEL_CARD.md"},{"key":"e_1_3_3_1_4_2","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech and et al. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. arxiv:https:\/\/arXiv.org\/abs\/2204.14198\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2204.14198"},{"key":"e_1_3_3_1_5_2","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arxiv:https:\/\/arXiv.org\/abs\/2308.12966\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2308.12966"},{"volume-title":"International Conference on Learning Representations","author":"Bao Hangbo","key":"e_1_3_3_1_6_2","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. [n. d.]. BEiT: BERT Pre-Training of Image Transformers. In International Conference on Learning Representations."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.bigscience-1.9"},{"key":"e_1_3_3_1_8_2","unstructured":"Tom\u00a0B. Brown Benjamin Mann Nick Ryder Melanie Subbiah and et al. 2020. Language Models are Few-Shot Learners. arxiv:https:\/\/arXiv.org\/abs\/2005.14165\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2005.14165"},{"key":"e_1_3_3_1_9_2","unstructured":"Zheng Cai Maosong Cao Haojiong Chen Kai Chen Keyu Chen Xin Chen Xun Chen Zehui Chen Zhi Chen Pei Chu et\u00a0al. 2024. Internlm2 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.17297 (2024)."},{"key":"e_1_3_3_1_10_2","unstructured":"Wei-Lin Chiang Zhuohan Li Zi Lin Ying Sheng Zhanghao Wu and et al. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_3_1_11_2","unstructured":"Junyoung Chung Caglar Gulcehre KyungHyun Cho and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.3555 (2014)."},{"key":"e_1_3_3_1_12_2","unstructured":"Wenliang Dai Junnan Li Dongxu Li Anthony Meng\u00a0Huat Tiong Junqi Zhao Weisheng Wang Boyang Li Pascale Fung and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arxiv:https:\/\/arXiv.org\/abs\/2305.06500\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2305.06500"},{"key":"e_1_3_3_1_13_2","first-page":"4171","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 4171\u20134186."},{"key":"e_1_3_3_1_14_2","unstructured":"Xiaoyi Dong Pan Zhang Yuhang Zang Yuhang Cao and et al. 2024. InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model Handling Resolutions from 336 Pixels to 4K HD. arxiv:https:\/\/arXiv.org\/abs\/2404.06512\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2404.06512"},{"key":"e_1_3_3_1_15_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arxiv:https:\/\/arXiv.org\/abs\/2010.11929\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"e_1_3_3_1_16_2","unstructured":"Stefan Elfwing Eiji Uchibe and Kenji Doya. 2017. Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning. arxiv:https:\/\/arXiv.org\/abs\/1702.03118\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1702.03118"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Jeffrey\u00a0L Elman. 1990. Finding structure in time. Cognitive science 14 2 (1990) 179\u2013211.","DOI":"10.1016\/0364-0213(90)90002-E"},{"key":"e_1_3_3_1_18_2","unstructured":"Tao Gong Chengqi Lyu Shilong Zhang Yudong Wang Miao Zheng Qian Zhao Kuikun Liu Wenwei Zhang Ping Luo and Kai Chen. 2023. MultiModal-GPT: A Vision and Language Model for Dialogue with Humans. arxiv:https:\/\/arXiv.org\/abs\/2305.04790\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2305.04790"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Alex Graves and Alex Graves. 2012. Long short-term memory. Supervised sequence labelling with recurrent neural networks (2012) 37\u201345.","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"e_1_3_3_1_20_2","unstructured":"Albert Gu and Tri Dao. 2024. Mamba: Linear-Time Sequence Modeling with Selective State Spaces. arxiv:https:\/\/arXiv.org\/abs\/2312.00752\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2312.00752"},{"key":"e_1_3_3_1_21_2","first-page":"1474","volume-title":"Advances in Neural Information Processing Systems","author":"Gu Albert","year":"2020","unstructured":"Albert Gu, Tri Dao, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2020. HiPPO: Recurrent Memory with Optimal Polynomial Projections. In Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.), Vol.\u00a033. Curran Associates, Inc., 1474\u20131487. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/102f0bb6efb3a6128a3c750dd16729be-Paper.pdf"},{"key":"e_1_3_3_1_22_2","unstructured":"Albert Gu Tri Dao Stefano Ermon Atri Rudra and Christopher R\u00e9. 2020. Hippo: Recurrent memory with optimal polynomial projections. Advances in neural information processing systems 33 (2020) 1474\u20131487."},{"key":"e_1_3_3_1_23_2","unstructured":"Albert Gu Karan Goel Ankit Gupta and Christopher R\u00e9. 2022. On the parameterization and initialization of diagonal state space models. Advances in Neural Information Processing Systems 35 (2022) 35971\u201335983."},{"key":"e_1_3_3_1_24_2","first-page":"35971","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Gu Albert","year":"2022","unstructured":"Albert Gu, Karan Goel, Ankit Gupta, and Christopher R\u00e9. 2022. On the Parameterization and Initialization of Diagonal State Space Models. In Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.), Vol.\u00a035. Curran Associates, Inc., 35971\u201335983. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/e9a32fade47b906de908431991440f7c-Paper-Conference.pdf"},{"volume-title":"International Conference on Learning Representations","author":"Gu Albert","key":"e_1_3_3_1_25_2","unstructured":"Albert Gu, Karan Goel, and Christopher Re. [n. d.]. Efficiently Modeling Long Sequences with Structured State Spaces. In International Conference on Learning Representations."},{"key":"e_1_3_3_1_26_2","volume-title":"International Conference on Learning Representations","author":"Gu Albert","year":"2022","unstructured":"Albert Gu, Karan Goel, and Christopher Re. 2022. Efficiently Modeling Long Sequences with Structured State Spaces. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=uYLFoz1vlAC"},{"key":"e_1_3_3_1_27_2","volume-title":"Advances in Neural Information Processing Systems","author":"Gu Albert","year":"2021","unstructured":"Albert Gu, Isys Johnson, Karan Goel, Khaled\u00a0Kamal Saab, Tri Dao, Atri Rudra, and Christopher Re. 2021. Combining Recurrent, Convolutional, and Continuous-time Models with Linear State Space Layers. In Advances in Neural Information Processing Systems, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.\u00a0Liang, and J.\u00a0Wortman Vaughan (Eds.). https:\/\/openreview.net\/forum?id=yWd42CWN3c"},{"key":"e_1_3_3_1_28_2","unstructured":"Ankit Gupta Albert Gu and Jonathan Berant. 2022. Diagonal state spaces are as effective as structured state spaces. Advances in Neural Information Processing Systems 35 (2022) 22982\u201322994."},{"key":"e_1_3_3_1_29_2","volume-title":"The Eleventh International Conference on Learning Representations","author":"Hasani Ramin","year":"2023","unstructured":"Ramin Hasani, Mathias Lechner, Tsun-Hsuan Wang, Makram Chahine, Alexander Amini, and Daniela Rus. 2023. Liquid Structural State-Space Models. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=g4OTKRKfS7R"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"e_1_3_3_1_31_2","first-page":"72096","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Huang Shaohan","year":"2023","unstructured":"Shaohan Huang, Li Dong, Wenhui Wang, Yaru Hao, and et al. 2023. Language Is Not All You Need: Aligning Perception with Language Models. In Advances in Neural Information Processing Systems, A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 72096\u201372109. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/e425b75bac5742a008d643826428787c-Paper-Conference.pdf"},{"key":"e_1_3_3_1_32_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Jain Neel","year":"2024","unstructured":"Neel Jain, Ping yeh Chiang, Yuxin Wen, John Kirchenbauer, Hong-Min Chu, Gowthami Somepalli, Brian\u00a0R. Bartoldson, Bhavya Kailkhura, Avi Schwarzschild, Aniruddha Saha, Micah Goldblum, Jonas Geiping, and Tom Goldstein. 2024. NEFTune: Noisy Embeddings Improve Instruction Finetuning. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=0bMmZ3fkCk"},{"key":"e_1_3_3_1_33_2","unstructured":"Jared Kaplan Sam McCandlish Tom Henighan Tom\u00a0B. Brown Benjamin Chess Rewon Child Scott Gray Alec Radford Jeffrey Wu and Dario Amodei. 2020. Scaling Laws for Neural Language Models. arxiv:https:\/\/arXiv.org\/abs\/2001.08361\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2001.08361"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Chenliang Li Haiyang Xu Junfeng Tian Wei Wang and et al. 2022. mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections. arxiv:https:\/\/arXiv.org\/abs\/2205.12005\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2205.12005","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"e_1_3_3_1_35_2","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2301.12597\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2301.12597"},{"key":"e_1_3_3_1_36_2","unstructured":"Junnan Li Dongxu Li Caiming Xiong and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arxiv:https:\/\/arXiv.org\/abs\/2201.12086\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2201.12086"},{"key":"e_1_3_3_1_37_2","unstructured":"Kunchang Li Xinhao Li Yi Wang Yinan He Yali Wang Limin Wang and Yu Qiao. 2024. VideoMamba: State Space Model for Efficient Video Understanding. arxiv:https:\/\/arXiv.org\/abs\/2403.06977\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2403.06977"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"e_1_3_3_1_39_2","unstructured":"Opher Lieber Barak Lenz Hofit Bata Gal Cohen Jhonathan Osin Itay Dalmedigos Erez Safahi Shaked Meirom Yonatan Belinkov Shai Shalev-Shwartz Omri Abend Raz Alon Tomer Asida Amir Bergman Roman Glozman Michael Gokhman Avashalom Manevich Nir Ratner Noam Rozen Erez Shwartz Mor Zusman and Yoav Shoham. 2024. Jamba: A Hybrid Transformer-Mamba Language Model. arxiv:https:\/\/arXiv.org\/abs\/2403.19887\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2403.19887"},{"key":"e_1_3_3_1_40_2","unstructured":"Bin Lin Zhenyu Tang Yang Ye Jiaxi Cui Bin Zhu Peng Jin Jinfa Huang Junwu Zhang Yatian Pang Munan Ning and Li Yuan. 2024. MoE-LLaVA: Mixture of Experts for Large Vision-Language Models. arxiv:https:\/\/arXiv.org\/abs\/2401.15947\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2401.15947"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_42_2","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong\u00a0Jae Lee. 2024. Improved Baselines with Visual Instruction Tuning. arxiv:https:\/\/arXiv.org\/abs\/2310.03744\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2310.03744"},{"key":"e_1_3_3_1_43_2","unstructured":"Haotian Liu Chunyuan Li Yuheng Li Bo Li Yuanhan Zhang Sheng Shen and Yong\u00a0Jae Lee. 2024. LLaVA-NeXT: Improved reasoning OCR and world knowledge. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"e_1_3_3_1_44_2","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning. arxiv:https:\/\/arXiv.org\/abs\/2304.08485\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2304.08485"},{"key":"e_1_3_3_1_45_2","unstructured":"Yue Liu Yunjie Tian Yuzhong Zhao Hongtian Yu Lingxi Xie Yaowei Wang Qixiang Ye and Yunfan Liu. 2024. VMamba: Visual State Space Model. arxiv:https:\/\/arXiv.org\/abs\/2401.10166\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2401.10166"},{"key":"e_1_3_3_1_46_2","first-page":"47016","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Lu Chris","year":"2023","unstructured":"Chris Lu, Yannick Schroecker, Albert Gu, Emilio Parisotto, Jakob Foerster, Satinder Singh, and Feryal Behbahani. 2023. Structured State Space Models for In-Context Reinforcement Learning. In Advances in Neural Information Processing Systems, A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 47016\u201347031. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/92d3d2a9801211ca3693ccb2faa1316f-Paper-Conference.pdf"},{"key":"e_1_3_3_1_47_2","volume-title":"The Eleventh International Conference on Learning Representations","author":"Ma Xuezhe","year":"2023","unstructured":"Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. 2023. Mega: Moving Average Equipped Gated Attention. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=qNLe3iq2El"},{"key":"e_1_3_3_1_48_2","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya and et al. 2024. GPT-4 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2303.08774\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_3_1_49_2","first-page":"27730","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, and et al. 2022. Training language models to follow instructions with human feedback. In Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.), Vol.\u00a035. Curran Associates, Inc., 27730\u201327744. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/b1efde53be364a73914f58805a001731-Paper-Conference.pdf"},{"key":"e_1_3_3_1_50_2","unstructured":"Jinyoung Park Hee-Seon Kim Kangwook Ko Minbeom Kim and Changick Kim. 2024. VideoMamba: Spatio-Temporal Selective State Space Model. arxiv:https:\/\/arXiv.org\/abs\/2407.08476\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2407.08476"},{"key":"e_1_3_3_1_51_2","unstructured":"Maciej Pi\u00f3ro Kamil Ciebiera Krystian Kr\u00f3l Jan Ludziejewski Micha\u0142 Krutul Jakub Krajewski Szymon Antoniak Piotr Mi\u0142o\u015b Marek Cygan and Sebastian Jaszczur. 2024. MoE-Mamba: Efficient Selective State Space Models with Mixture of Experts. arxiv:https:\/\/arXiv.org\/abs\/2401.04081\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2401.04081"},{"key":"e_1_3_3_1_52_2","unstructured":"Yanyuan Qiao Zheng Yu Longteng Guo Sihan Chen Zijia Zhao Mingzhen Sun Qi Wu and Jing Liu. 2024. VL-Mamba: Exploring State Space Models for Multimodal Learning. arxiv:https:\/\/arXiv.org\/abs\/2403.13600\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2403.13600"},{"key":"e_1_3_3_1_53_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv:https:\/\/arXiv.org\/abs\/2103.00020\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_3_1_54_2","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et\u00a0al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_3_1_55_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_1_56_2","unstructured":"Prajit Ramachandran Barret Zoph and Quoc\u00a0V. Le. 2017. Searching for Activation Functions. arxiv:https:\/\/arXiv.org\/abs\/1710.05941\u00a0[cs.NE] https:\/\/arxiv.org\/abs\/1710.05941"},{"key":"e_1_3_3_1_57_2","unstructured":"Yuheng Shi Minjing Dong and Chang Xu. 2024. Multi-Scale VMamba: Hierarchy in Hierarchy Visual State Space Model. arxiv:https:\/\/arXiv.org\/abs\/2405.14174\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2405.14174"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","unstructured":"shi bin and Wang Hao. 2024. Chinese Ceramics Image Caption Dataset. 10.57760\/sciencedb.j00133.00381","DOI":"10.57760\/sciencedb.j00133.00381"},{"key":"e_1_3_3_1_59_2","doi-asserted-by":"publisher","unstructured":"shi bin Wang Hao and Deng Sanhong. 2023. Chinese ceramic image clique identification data set. 10.57760\/sciencedb.j00133.00225","DOI":"10.57760\/sciencedb.j00133.00225"},{"key":"e_1_3_3_1_60_2","volume-title":"The Eleventh International Conference on Learning Representations","author":"Smith Jimmy\u00a0T.H.","year":"2023","unstructured":"Jimmy\u00a0T.H. Smith, Andrew Warrington, and Scott Linderman. 2023. Simplified State Space Layers for Sequence Modeling. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=Ai8Hw3AXqks"},{"key":"e_1_3_3_1_61_2","unstructured":"Nitish Srivastava Geoffrey Hinton Alex Krizhevsky Ilya Sutskever and Ruslan Salakhutdinov. 2014. Dropout: A Simple Way to Prevent Neural Networks from Overfitting. Journal of Machine Learning Research 15 56 (2014) 1929\u20131958. http:\/\/jmlr.org\/papers\/v15\/srivastava14a.html"},{"key":"e_1_3_3_1_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_3_1_63_2","unstructured":"Rohan Taori Ishaan Gulrajani Tianyi Zhang Yann Dubois Xuechen Li Carlos Guestrin Percy Liang and Tatsunori\u00a0B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_3_1_64_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arxiv:https:\/\/arXiv.org\/abs\/2302.13971\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"e_1_3_3_1_65_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi and et al. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arxiv:https:\/\/arXiv.org\/abs\/2307.09288\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2307.09288"},{"key":"e_1_3_3_1_66_2","unstructured":"Wenhui Wang Hangbo Bao Li Dong Johan Bjorck Zhiliang Peng Qiang Liu Kriti Aggarwal Owais\u00a0Khan Mohammed Saksham Singhal Subhojit Som and Furu Wei. 2022. Image as a Foreign Language: BEiT Pretraining for All Vision and Vision-Language Tasks. arxiv:https:\/\/arXiv.org\/abs\/2208.10442\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2208.10442"},{"key":"e_1_3_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.754"},{"key":"e_1_3_3_1_68_2","volume-title":"International Conference on Learning Representations","author":"Wang Zirui","year":"2022","unstructured":"Zirui Wang, Jiahui Yu, Adams\u00a0Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2022. SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=GUrhfTuf_3"},{"key":"e_1_3_3_1_69_2","unstructured":"Ruyi Xu Yuan Yao Zonghao Guo Junbo Cui Zanlin Ni Chunjiang Ge Tat-Seng Chua Zhiyuan Liu Maosong Sun and Gao Huang. 2024. LLaVA-UHD: an LMM Perceiving Any Aspect Ratio and High-Resolution Images. arxiv:https:\/\/arXiv.org\/abs\/2403.11703\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2403.11703"},{"key":"e_1_3_3_1_70_2","unstructured":"Jiahui Yu Zirui Wang Vijay Vasudevan Legg Yeung Mojtaba Seyedhosseini and Yonghui Wu. 2022. CoCa: Contrastive Captioners are Image-Text Foundation Models. arxiv:https:\/\/arXiv.org\/abs\/2205.01917\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2205.01917"},{"key":"e_1_3_3_1_71_2","unstructured":"Susan Zhang Stephen Roller Naman Goyal Mikel Artetxe Moya Chen Shuohui Chen Christopher Dewan Mona Diab Xian Li Xi\u00a0Victoria Lin Todor Mihaylov Myle Ott Sam Shleifer Kurt Shuster Daniel Simig Punit\u00a0Singh Koura Anjali Sridhar Tianlu Wang and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arxiv:https:\/\/arXiv.org\/abs\/2205.01068\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2205.01068"},{"key":"e_1_3_3_1_72_2","first-page":"46595","volume-title":"Advances in Neural Information Processing Systems","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, and et al. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In Advances in Neural Information Processing Systems, A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 46595\u201346623. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/91f18a1287b398d378ef22505bf41832-Paper-Datasets_and_Benchmarks.pdf"},{"key":"e_1_3_3_1_73_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7000"},{"key":"e_1_3_3_1_74_2","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arxiv:https:\/\/arXiv.org\/abs\/2304.10592\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2304.10592"},{"key":"e_1_3_3_1_75_2","unstructured":"Lianghui Zhu Bencheng Liao Qian Zhang Xinlong Wang Wenyu Liu and Xinggang Wang. 2024. Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model. arxiv:https:\/\/arXiv.org\/abs\/2401.09417\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2401.09417"}],"event":{"name":"ICCPR 2024: 2024 13th International Conference on Computing and Pattern Recognition","acronym":"ICCPR 2024","location":"Tianjin China"},"container-title":["Proceedings of the 2024 13th International Conference on Computing and Pattern Recognition"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3704323.3704364","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3704323.3704364","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:05Z","timestamp":1750295885000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3704323.3704364"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,25]]},"references-count":74,"alternative-id":["10.1145\/3704323.3704364","10.1145\/3704323"],"URL":"https:\/\/doi.org\/10.1145\/3704323.3704364","relation":{},"subject":[],"published":{"date-parts":[[2024,10,25]]},"assertion":[{"value":"2025-01-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}