{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T03:28:03Z","timestamp":1778902083939,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China","award":["(Grant No. 62276245)"],"award-info":[{"award-number":["(Grant No. 62276245)"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755754","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"5110-5119","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Input Domain Aware MoE: Decoupling Routing Decisions from Task Optimization in Mixture of Experts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-8849-4717","authenticated-orcid":false,"given":"YongXiang","family":"Hua","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and State Key Laboratory of Cognitive Intelligence, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3789-9705","authenticated-orcid":false,"given":"Haoyu","family":"Cao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and State Key Laboratory of Cognitive Intelligence, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4433-4373","authenticated-orcid":false,"given":"Zhou","family":"Tao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and State Key Laboratory of Cognitive Intelligence, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2171-7902","authenticated-orcid":false,"given":"Bocheng","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and State Key Laboratory of Cognitive Intelligence, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2199-8518","authenticated-orcid":false,"given":"Zihao","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and State Key Laboratory of Cognitive Intelligence, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7588-4264","authenticated-orcid":false,"given":"Chaohu","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and State Key Laboratory of Cognitive Intelligence, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0227-3793","authenticated-orcid":false,"given":"Linli","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China and State Key Laboratory of Cognitive Intelligence, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_2_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_2_3_1","first-page":"32897","article-title":"Vlmo: Unified vision-language pre-training with mixture-of-modality-experts","volume":"35","author":"Bao Hangbo","year":"2022","unstructured":"Hangbo Bao, Wenhui Wang, Li Dong, Qiang Liu, Owais Khan Mohammed, Kriti Aggarwal, Subhojit Som, Songhao Piao, and Furu Wei. 2022. Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. Advances in Neural Information Processing Systems, Vol. 35 (2022), 32897-32912.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01788"},{"key":"e_1_3_2_2_5_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et al. 2024a. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:2412.05271 (2024)."},{"key":"e_1_3_2_2_6_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185-24198","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024b. Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 24185-24198."},{"key":"e_1_3_2_2_7_1","first-page":"34600","article-title":"On the representation collapse of sparse mixture of experts","volume":"35","author":"Chi Zewen","year":"2022","unstructured":"Zewen Chi, Li Dong, Shaohan Huang, Damai Dai, Shuming Ma, Barun Patra, Saksham Singhal, Payal Bajaj, Xia Song, Xian-Ling Mao, et al., 2022. On the representation collapse of sparse mixture of experts. Advances in Neural Information Processing Systems, Vol. 35 (2022), 34600-34613.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_8_1","volume-title":"Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. arXiv preprint arXiv:2401.06066","author":"Dai Damai","year":"2024","unstructured":"Damai Dai, Chengqi Deng, Chenggang Zhao, RX Xu, Huazuo Gao, Deli Chen, Jiashi Li, Wangding Zeng, Xingkai Yu, Y Wu, et al., 2024. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. arXiv preprint arXiv:2401.06066 (2024)."},{"key":"e_1_3_2_2_9_1","volume-title":"Stablemoe: Stable routing strategy for mixture of experts. arXiv preprint arXiv:2204.08396","author":"Dai Damai","year":"2022","unstructured":"Damai Dai, Li Dong, Shuming Ma, Bo Zheng, Zhifang Sui, Baobao Chang, and Furu Wei. 2022. Stablemoe: Stable routing strategy for mixture of experts. arXiv preprint arXiv:2204.08396 (2022)."},{"key":"e_1_3_2_2_10_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, Vol. 23, 120 (2022), 1-39.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_11_1","volume-title":"Vita: Towards open-source interactive omni multimodal llm. arXiv preprint arXiv:2408.05211","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Haojia Lin, Zuwei Long, Yunhang Shen, Yuhang Dai, Meng Zhao, Yi-Fan Zhang, Shaoqi Dong, Yangze Li, Xiong Wang, et al., 2024. Vita: Towards open-source interactive omni multimodal llm. arXiv preprint arXiv:2408.05211 (2024)."},{"key":"e_1_3_2_2_12_1","unstructured":"Chaoyou Fu Haojia Lin Xiong Wang Yi-Fan Zhang Yunhang Shen Xiaoyu Liu Haoyu Cao Zuwei Long Heting Gao Ke Li et al. 2025. Vita-1.5: Towards gpt-4o level real-time vision and speech interaction. arXiv preprint arXiv:2501.01957 (2025)."},{"key":"e_1_3_2_2_13_1","volume-title":"Heng Tao Shen, and Xing Xu","author":"Gao Zixian","year":"2024","unstructured":"Zixian Gao, Disen Hu, Xun Jiang, Huimin Lu, Heng Tao Shen, and Xing Xu. 2024. Enhanced Experts with Uncertainty-Aware Routing for Multimodal Sentiment Analysis. In ACM Multimedia 2024. https:\/\/openreview.net\/forum?id=78TMql1c04"},{"key":"e_1_3_2_2_14_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508418"},{"key":"e_1_3_2_2_17_1","volume-title":"Efficient multimodal learning from data-centric perspective. arXiv preprint arXiv:2402.11530","author":"He Muyang","year":"2024","unstructured":"Muyang He, Yexin Liu, Boya Wu, Jianhao Yuan, Yueze Wang, Tiejun Huang, and Bo Zhao. 2024. Efficient multimodal learning from data-centric perspective. arXiv preprint arXiv:2402.11530 (2024)."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_2_19_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al., 2024. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_2_20_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"24353","author":"Kim Yechan","year":"2024","unstructured":"Yechan Kim, Hwijoon Lim, and Dongsu Han. 2024. Scaling Beyond the GPU Memory Limit for Large Mixture-of-Experts Model Training. In Proceedings of the 41st International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 235), Ruslan Salakhutdinov, Zico Kolter, Katherine Heller, Adrian Weller, Nuria Oliver, Jonathan Scarlett, and Felix Berkenkamp (Eds.). PMLR, 24342-24353. https:\/\/proceedings.mlr.press\/v235\/kim24w.html"},{"key":"e_1_3_2_2_21_1","volume-title":"Basil Mustafa, Joshua Ainslie, Yi Tay, Mostafa Dehghani, and Neil Houlsby.","author":"Komatsuzaki Aran","year":"2022","unstructured":"Aran Komatsuzaki, Joan Puigcerver, James Lee-Thorp, Carlos Riquelme Ruiz, Basil Mustafa, Joshua Ainslie, Yi Tay, Mostafa Dehghani, and Neil Houlsby. 2022. Sparse upcycling: Training mixture-of-experts from dense checkpoints. arXiv preprint arXiv:2212.05055 (2022)."},{"key":"e_1_3_2_2_22_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020)."},{"key":"e_1_3_2_2_23_1","volume-title":"Mimic-it: Multi-modal in-context instruction tuning. arXiv preprint arXiv:2306.05425","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Fanyi Pu, Jingkang Yang, Chunyuan Li, and Ziwei Liu. 2023c. Mimic-it: Multi-modal in-context instruction tuning. arXiv preprint arXiv:2306.05425 (2023)."},{"key":"e_1_3_2_2_24_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_2_25_1","first-page":"131224","article-title":"Cumo: Scaling multimodal llm with co-upcycled mixture-of-experts","volume":"37","author":"Li Jiachen","year":"2024","unstructured":"Jiachen Li, Xinyao Wang, Sijie Zhu, Chia-Wen Kuo, Lu Xu, Fan Chen, Jitesh Jain, Humphrey Shi, and Longyin Wen. 2024a. Cumo: Scaling multimodal llm with co-upcycled mixture-of-experts. Advances in Neural Information Processing Systems, Vol. 37 (2024), 131224-131246.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_26_1","volume-title":"Wayne Xin Zhao, and Ji-Rong Wen","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023a. Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)."},{"key":"e_1_3_2_2_27_1","volume-title":"Uni-moe: Scaling unified multimodal llms with mixture of experts","author":"Li Yunxin","year":"2025","unstructured":"Yunxin Li, Shenyuan Jiang, Baotian Hu, Longyue Wang, Wanqi Zhong, Wenhan Luo, Lin Ma, and Min Zhang. 2025. Uni-moe: Scaling unified multimodal llms with mixture of experts. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025)."},{"key":"e_1_3_2_2_28_1","volume-title":"Mini-gemini: Mining the potential of multi-modality vision language models. arXiv preprint arXiv:2403.18814","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Yuechen Zhang, Chengyao Wang, Zhisheng Zhong, Yixin Chen, Ruihang Chu, Shaoteng Liu, and Jiaya Jia. 2024b. Mini-gemini: Mining the potential of multi-modality vision language models. arXiv preprint arXiv:2403.18814 (2024)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3672758.3672824"},{"key":"e_1_3_2_2_30_1","volume-title":"Moe-llava: Mixture of experts for large vision-language models. arXiv preprint arXiv:2401.15947","author":"Lin Bin","year":"2024","unstructured":"Bin Lin, Zhenyu Tang, Yang Ye, Jiaxi Cui, Bin Zhu, Peng Jin, Jinfa Huang, Junwu Zhang, Yatian Pang, Munan Ning, et al., 2024a. Moe-llava: Mixture of experts for large vision-language models. arXiv preprint arXiv:2401.15947 (2024)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"e_1_3_2_2_32_1","unstructured":"Aixin Liu Bei Feng Bin Wang Bingxuan Wang Bo Liu Chenggang Zhao Chengqi Dengr Chong Ruan Damai Dai Daya Guo et al. 2024b. Deepseek-v2: A strong economical and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434 (2024)."},{"key":"e_1_3_2_2_33_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024c. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01471"},{"key":"e_1_3_2_2_35_1","volume-title":"Aligning Large Multi-Modal Model with Robust Instruction Tuning. arXiv preprint arXiv:2306.14565","author":"Liu Fuxiao","year":"2023","unstructured":"Fuxiao Liu, Kevin Lin, Linjie Li, Jianfeng Wang, Yaser Yacoob, and Lijuan Wang. 2023c. Aligning Large Multi-Modal Model with Robust Instruction Tuning. arXiv preprint arXiv:2306.14565 (2023)."},{"key":"e_1_3_2_2_36_1","volume-title":"Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023a. Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_2_38_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023b. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_2_39_1","volume-title":"Shuohang Wang, Chen Liang, Yelong Shen, Hao Cheng, Xiaodong Liu, Masahiro Tanaka, Xiaoxia Wu, Wenxiang Hu, et al.","author":"Liu Liyuan","year":"2024","unstructured":"Liyuan Liu, Young Jin Kim, Shuohang Wang, Chen Liang, Yelong Shen, Hao Cheng, Xiaodong Liu, Masahiro Tanaka, Xiaoxia Wu, Wenxiang Hu, et al., 2024d. Grin: Gradient-informed moe. arXiv preprint arXiv:2409.12136 (2024)."},{"key":"e_1_3_2_2_40_1","volume-title":"European conference on computer vision. Springer, 216-233","author":"Liu Yuan","year":"2024","unstructured":"Yuan Liu, Haodong Duan, Yuanhan Zhang, Bo Li, Songyang Zhang, Wangbo Zhao, Yike Yuan, Jiaqi Wang, Conghui He, Ziwei Liu, et al., 2024a. Mmbench: Is your multi-modal model an all-around player?. In European conference on computer vision. Springer, 216-233."},{"key":"e_1_3_2_2_41_1","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume":"35","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tanglin Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to explain: Multimodal reasoning via thought chains for science question answering. Advances in Neural Information Processing Systems, Vol. 35 (2022), 2507-2521.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588964"},{"key":"e_1_3_2_2_43_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_2_44_1","volume-title":"Pangu: Towards trillion parameter language model with sparse heterogeneous computing. arXiv preprint arXiv:2303.10845","author":"Ren Xiaozhe","year":"2023","unstructured":"Xiaozhe Ren, Pingyi Zhou, Xinfan Meng, Xinjing Huang, Yadao Wang, Weichao Wang, Pengfei Li, Xiaoda Zhang, Alexander Podolskiy, Grigory Arshinov, et al., 2023. Pangu: Towards trillion parameter language model with sparse heterogeneous computing. arXiv preprint arXiv:2303.10845 (2023)."},{"key":"e_1_3_2_2_45_1","first-page":"17555","article-title":"Hash layers for large sparse models","volume":"34","author":"Roller Stephen","year":"2021","unstructured":"Stephen Roller, Sainbayar Sukhbaatar, Jason Weston, et al., 2021. Hash layers for large sparse models. Advances in Neural Information Processing Systems, Vol. 34 (2021), 17555-17566.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_46_1","volume-title":"Imp: Highly capable large multimodal models for mobile devices. arXiv preprint arXiv:2405.12107","author":"Shao Zhenwei","year":"2024","unstructured":"Zhenwei Shao, Zhou Yu, Jun Yu, Xuecheng Ouyang, Lihao Zheng, Zhenbiao Gai, Mingyang Wang, and Jiajun Ding. 2024. Imp: Highly capable large multimodal models for mobile devices. arXiv preprint arXiv:2405.12107 (2024)."},{"key":"e_1_3_2_2_47_1","volume-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=B1ckMDqlg","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, *Azalia Mirhoseini, *Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=B1ckMDqlg"},{"key":"e_1_3_2_2_48_1","unstructured":"Yunhang Shen Chaoyou Fu Shaoqi Dong Xiong Wang Yi-Fan Zhang Peixian Chen Mengdan Zhang Haoyu Cao Ke Li Xiawu Zheng et al. 2025. Long-VITA: Scaling Large Multi-modal Models to 1 Million Tokens with Leading Short-Context Accuracy. arXiv preprint arXiv:2502.05177 (2025)."},{"key":"e_1_3_2_2_49_1","volume-title":"ModuleFormer: Modularity Emerges from Mixture-of-Experts. arXiv preprint arXiv:2306.04640","author":"Shen Yikang","year":"2023","unstructured":"Yikang Shen, Zheyu Zhang, Tianyou Cao, Shawn Tan, Zhenfang Chen, and Chuang Gan. 2023. ModuleFormer: Modularity Emerges from Mixture-of-Experts. arXiv preprint arXiv:2306.04640 (2023)."},{"key":"e_1_3_2_2_50_1","volume-title":"Llava-mod: Making llava tiny via moe knowledge distillation. arXiv preprint arXiv:2408.15881","author":"Shu Fangxun","year":"2024","unstructured":"Fangxun Shu, Yue Liao, Le Zhuo, Chenning Xu, Lei Zhang, Guanghao Zhang, Haonan Shi, Long Chen, Tao Zhong, Wanggui He, et al., 2024. Llava-mod: Making llava tiny via moe knowledge distillation. arXiv preprint arXiv:2408.15881 (2024)."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","unstructured":"Amanpreet Singh Vivek Natarajan Meet Shah Yu Jiang Xinlei Chen Dhruv Batra Devi Parikh and Marcus Rohrbach. 2019. Towards vqa models that can read. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 8317-8326.","DOI":"10.1109\/CVPR.2019.00851"},{"key":"e_1_3_2_2_52_1","volume-title":"To see is to believe: Prompting GPT-4V for better visual instruction tuning. arXiv preprint arXiv:2311.07574","author":"Wang Junke","year":"2023","unstructured":"Junke Wang, Lingchen Meng, Zejia Weng, Bo He, Zuxuan Wu, and Yu-Gang Jiang. 2023. To see is to believe: Prompting GPT-4V for better visual instruction tuning. arXiv preprint arXiv:2311.07574 (2023)."},{"key":"e_1_3_2_2_53_1","volume-title":"Auxiliary-loss-free load balancing strategy for mixture-of-experts. arXiv preprint arXiv:2408.15664","author":"Wang Lean","year":"2024","unstructured":"Lean Wang, Huazuo Gao, Chenggang Zhao, Xu Sun, and Damai Dai. 2024a. Auxiliary-loss-free load balancing strategy for mixture-of-experts. arXiv preprint arXiv:2408.15664 (2024)."},{"key":"e_1_3_2_2_54_1","volume-title":"Pro-Prophet: Systematic Load Balancing Method for Efficient Parallel Training of Large-scale MoE Models. arXiv preprint arXiv:2411.10003","author":"Wang Wei","year":"2024","unstructured":"Wei Wang, Zhiquan Lai, Shengwei Li, Weijie Liu, Keshi Ge, Ao Shen, and Dongsheng Li. 2024b. Pro-Prophet: Systematic Load Balancing Method for Efficient Parallel Training of Large-scale MoE Models. arXiv preprint arXiv:2411.10003 (2024)."},{"key":"e_1_3_2_2_55_1","volume-title":"Skywork-moe: A deep dive into training techniques for mixture-of-experts language models. arXiv preprint arXiv:2406.06563","author":"Wei Tianwen","year":"2024","unstructured":"Tianwen Wei, Bo Zhu, Liang Zhao, Cheng Cheng, Biye Li, Weiwei L\u00fc, Peng Cheng, Jianhao Zhang, Xiaoyu Zhang, Liang Zeng, et al., 2024. Skywork-moe: A deep dive into training techniques for mixture-of-experts language models. arXiv preprint arXiv:2406.06563 (2024)."},{"key":"e_1_3_2_2_56_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Wu Chenwei","unstructured":"Chenwei Wu, Zitao Shuai, Zhengxu Tang, Luning Wang, and Liyue Shen. [n.d.]. Dynamic Modeling of Patients, Modalities and Tasks via Multi-modal Multi-task Mixture of Experts. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_2_57_1","volume-title":"GW-MoE: Resolving Uncertainty in MoE Router with Global Workspace Theory. arXiv preprint arXiv:2406.12375","author":"Wu Haoze","year":"2024","unstructured":"Haoze Wu, Zihan Qiu, Zili Wang, Hang Zhao, and Jie Fu. 2024c. GW-MoE: Resolving Uncertainty in MoE Router with Global Workspace Theory. arXiv preprint arXiv:2406.12375 (2024)."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01347"},{"key":"e_1_3_2_2_59_1","unstructured":"Zhiyu Wu Xiaokang Chen Zizheng Pan Xingchao Liu Wen Liu Damai Dai Huazuo Gao Yiyang Ma Chengyue Wu Bingxuan Wang et al. 2024a. Deepseek-vl2: Mixture-of-experts vision-language models for advanced multimodal understanding. arXiv preprint arXiv:2412.10302 (2024)."},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681683"},{"key":"e_1_3_2_2_61_1","volume-title":"Openmoe: An early effort on open mixture-of-experts language models. arXiv preprint arXiv:2402.01739","author":"Xue Fuzhao","year":"2024","unstructured":"Fuzhao Xue, Zian Zheng, Yao Fu, Jinjie Ni, Zangwei Zheng, Wangchunshu Zhou, and Yang You. 2024. Openmoe: An early effort on open mixture-of-experts language models. arXiv preprint arXiv:2402.01739 (2024)."},{"key":"e_1_3_2_2_62_1","volume-title":"Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652","author":"Young Alex","year":"2024","unstructured":"Alex Young, Bei Chen, Chao Li, Chengen Huang, Ge Zhang, Guanwei Zhang, Guoyin Wang, Heng Li, Jiangcheng Zhu, Jianqun Chen, et al., 2024. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652 (2024)."},{"key":"e_1_3_2_2_63_1","volume-title":"MMoE: Enhancing Multimodal Models with Mixtures of Multimodal Interaction Experts. arXiv preprint arXiv:2311.09580","author":"Yu Haofei","year":"2023","unstructured":"Haofei Yu, Zhengyang Qi, Lawrence Jang, Ruslan Salakhutdinov, Louis-Philippe Morency, and Paul Pu Liang. 2023a. MMoE: Enhancing Multimodal Models with Mixtures of Multimodal Interaction Experts. arXiv preprint arXiv:2311.09580 (2023)."},{"key":"e_1_3_2_2_64_1","volume-title":"Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490","author":"Yu Weihao","year":"2023","unstructured":"Weihao Yu, Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Zicheng Liu, Xinchao Wang, and Lijuan Wang. 2023b. Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)."},{"key":"e_1_3_2_2_65_1","unstructured":"Haotian Zhang Mingfei Gao Zhe Gan Philipp Dufter Nina Wenzel Forrest Huang Dhruti Shah Xianzhi Du Bowen Zhang Yanghao Li et al. 2024. Mm1. 5: Methods analysis & insights from multimodal llm fine-tuning. arXiv preprint arXiv:2409.20566 (2024)."},{"key":"e_1_3_2_2_66_1","volume-title":"Svit: Scaling up visual instruction tuning. arXiv preprint arXiv:2307.04087","author":"Zhao Bo","year":"2023","unstructured":"Bo Zhao, Boya Wu, and Tiejun Huang. 2023. Svit: Scaling up visual instruction tuning. arXiv preprint arXiv:2307.04087 (2023)."},{"key":"e_1_3_2_2_67_1","volume-title":"International Conference on Machine Learning. PMLR, 42531-42542","author":"Zhou Yanqi","year":"2023","unstructured":"Yanqi Zhou, Nan Du, Yanping Huang, Daiyi Peng, Chang Lan, Da Huang, Siamak Shakeri, David So, Andrew M Dai, Yifeng Lu, et al., 2023. Brainformers: Trading simplicity for efficiency. In International Conference on Machine Learning. PMLR, 42531-42542."},{"key":"e_1_3_2_2_68_1","first-page":"7103","article-title":"Mixture-of-experts with expert choice routing","volume":"35","author":"Zhou Yanqi","year":"2022","unstructured":"Yanqi Zhou, Tao Lei, Hanxiao Liu, Nan Du, Yanping Huang, Vincent Zhao, Andrew M Dai, Quoc V Le, James Laudon, et al., 2022. Mixture-of-experts with expert choice routing. Advances in Neural Information Processing Systems, Vol. 35 (2022), 7103-7114.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3688863.3689575"},{"key":"e_1_3_2_2_70_1","volume-title":"St-moe: Designing stable and transferable sparse expert models. arXiv preprint arXiv:2202.08906","author":"Zoph Barret","year":"2022","unstructured":"Barret Zoph, Irwan Bello, Sameer Kumar, Nan Du, Yanping Huang, Jeff Dean, Noam Shazeer, and William Fedus. 2022. St-moe: Designing stable and transferable sparse expert models. arXiv preprint arXiv:2202.08906 (2022)."},{"key":"e_1_3_2_2_71_1","volume-title":"Hany Hassan, Ruofei Zhang, Tuo Zhao, and Jianfeng Gao.","author":"Zuo Simiao","year":"2021","unstructured":"Simiao Zuo, Xiaodong Liu, Jian Jiao, Young Jin Kim, Hany Hassan, Ruofei Zhang, Tuo Zhao, and Jianfeng Gao. 2021. Taming sparsely activated transformer with stochastic experts. arXiv preprint arXiv:2110.04260 (2021)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755754","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T03:59:39Z","timestamp":1765339179000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755754"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":71,"alternative-id":["10.1145\/3746027.3755754","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755754","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}