{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:07:08Z","timestamp":1765310828270,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"name":"The Postdoctoral Fellowship Program (Grade C) of China Postdoctoral Science Foundation (CPSF)","award":["GZC20251087"],"award-info":[{"award-number":["GZC20251087"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755586","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"4758-4767","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-Task Dense Prediction Fine-Tuning with Mixture of Fine-Grained Experts"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7098-8419","authenticated-orcid":false,"given":"Yangyang","family":"Xu","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1763-6019","authenticated-orcid":false,"given":"Xi","family":"Ye","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9607-3639","authenticated-orcid":false,"given":"Duo","family":"Su","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"MTLoRA: A Low-Rank Adaptation Approach for Efficient Multi-Task Learning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16196-16205","author":"Agiza Ahmed","year":"2024","unstructured":"Ahmed Agiza, Marina Neseem, and Sherief Reda. 2024. MTLoRA: A Low-Rank Adaptation Approach for Efficient Multi-Task Learning. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 16196-16205."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01743"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01557"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.254"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01138"},{"key":"e_1_3_2_1_6_1","volume-title":"Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. arXiv preprint arXiv:2401.06066","author":"Dai Damai","year":"2024","unstructured":"Damai Dai, Chengqi Deng, Chenggang Zhao, RX Xu, Huazuo Gao, Deli Chen, Jiashi Li, Wangding Zeng, Xingkai Yu, Yu Wu, et al., 2024. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. arXiv preprint arXiv:2401.06066 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_8_1","volume-title":"International conference on machine learning. PMLR, 5547-5569","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, et al., 2022. Glam: Efficient scaling of language models with mixture-of-experts. In International conference on machine learning. PMLR, 5547-5569."},{"key":"e_1_3_2_1_9_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, Vol. 23, 120 (2022), 1-39.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_10_1","volume-title":"The Tenth International Conference on Learning Representations.","author":"He Junxian","year":"2022","unstructured":"Junxian He, Chunting Zhou, Xuezhe Ma, Taylor Berg-Kirkpatrick, and Graham Neubig. 2022. Towards a unified view of parameter-efficient transfer learning. In The Tenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_11_1","volume-title":"The Tenth International Conference on Learning Representations.","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. Lora: Low-rank adaptation of large language models. In The Tenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02662"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/21.229447"},{"key":"e_1_3_2_1_14_1","volume-title":"Adaptive mixtures of local experts. Neural computation","author":"Jacobs Robert A","year":"1991","unstructured":"Robert A Jacobs, Michael I Jordan, Steven J Nowlan, and Geoffrey E Hinton. 1991. Adaptive mixtures of local experts. Neural computation, Vol. 3, 1 (1991), 79-87."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681581"},{"key":"e_1_3_2_1_17_1","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume":"34","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, James Henderson, and Sebastian Ruder. 2021. Compacter: Efficient low-rank hypercomplex adapter layers. In Advances in Neural Information Processing Systems, Vol. 34. 1022-1035.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.579"},{"key":"e_1_3_2_1_19_1","volume-title":"GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In 9th International Conference on Learning Representations.","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2021. GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In 9th International Conference on Learning Representations."},{"key":"e_1_3_2_1_20_1","unstructured":"Hanxue liang Zhiwen Fan Rishov Sarkar Ziyu Jiang Tianlong Chen Kai Zou Yu Cheng Cong Hao and Zhangyang Wang. 2022. M3ViT: Mixture-of-Experts Vision Transformer for Efficient Multi-task Learning with Model-Accelerator Co-design. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_21_1","first-page":"19645","article-title":"Effective adaptation in multi-task co-training for unified autonomous driving","volume":"35","author":"Liang Xiwen","year":"2022","unstructured":"Xiwen Liang, Yangxin Wu, Jianhua Han, Hang Xu, Chunjing Xu, and Xiaodan Liang. 2022. Effective adaptation in multi-task co-training for unified autonomous driving. Advances in Neural Information Processing Systems, Vol. 35 (2022), 19645-19658.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Advances in neural information processing systems","author":"Liu Hanxiao","key":"e_1_3_2_1_22_1","unstructured":"Hanxiao Liu, Zihang Dai, David So, and Quoc V Le. 2021a. Pay Attention to MLPs. In Advances in neural information processing systems, Vol. 34. 9204-9215."},{"key":"e_1_3_2_1_23_1","volume-title":"Polyhistor: Parameter-Efficient Multi-Task Adaptation for Dense Vision Tasks. In Advances in Neural Information Processing Systems.","author":"Liu Yen-Cheng","year":"2022","unstructured":"Yen-Cheng Liu, Chih-Yao Ma, Junjiao Tian, Zijian He, and Zsolt Kira. 2022. Polyhistor: Parameter-Efficient Multi-Task Adaptation for Dense Vision Tasks. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3349865"},{"key":"e_1_3_2_1_26_1","first-page":"565","article-title":"Parameter-efficient Multi-task Fine-tuning for Transformers via Shared Hypernetworks","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, Sebastian Ruder, Mostafa Dehghani, and James Henderson. 2021. Parameter-efficient Multi-task Fine-tuning for Transformers via Shared Hypernetworks. In ACL\/IJCNLP. 565-576.","journal-title":"ACL\/IJCNLP."},{"key":"e_1_3_2_1_27_1","volume-title":"DiTASK: Multi-Task Fine-Tuning with Diffeomorphic Transformations. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Ipsit Mantri Krishna Sri","year":"2025","unstructured":"Krishna Sri Ipsit Mantri, Carola-Bibiane Sch\u00f6nlieb, Bruno Ribeiro, Chaim Baskin, and Moshe Eliasof. 2025. DiTASK: Multi-Task Fine-Tuning with Diffeomorphic Transformations. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01505"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"e_1_3_2_1_30_1","volume-title":"Deep High-Resolution Representation Learning for Human Pose Estimation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5693-5703","author":"Sun Ke","year":"2019","unstructured":"Ke Sun, Bin Xiao, Dong Liu, and Jingdong Wang. 2019. Deep High-Resolution Representation Learning for Human Pose Estimation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5693-5703."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"e_1_3_2_1_32_1","volume-title":"Smile: Zero-shot sparse mixture of low-rank experts construction from pre-trained foundation models. arXiv preprint arXiv:2408.10174","author":"Tang Anke","year":"2024","unstructured":"Anke Tang, Li Shen, Yong Luo, Shuai Xie, Han Hu, Lefei Zhang, Bo Du, and Dacheng Tao. 2024a. Smile: Zero-shot sparse mixture of low-rank experts construction from pre-trained foundation models. arXiv preprint arXiv:2408.10174 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Machine Learning. PMLR, 47778-47799","author":"Tang Anke","year":"2024","unstructured":"Anke Tang, Li Shen, Yong Luo, Nan Yin, Lefei Zhang, and Dacheng Tao. 2024b. Merging Multi-Task Models via Weight-Ensembling Mixture of Experts. In International Conference on Machine Learning. PMLR, 47778-47799."},{"key":"e_1_3_2_1_34_1","first-page":"3614","article-title":"Multi-Task Learning for Dense Prediction Tasks: A Survey","volume":"44","author":"Vandenhende S.","year":"2022","unstructured":"S. Vandenhende, S. Georgoulis, W. Van Gansbeke, M. Proesmans, D. Dai, and L. Van Gool. 2022. Multi-Task Learning for Dense Prediction Tasks: A Survey. IEEE TPAMI, Vol. 44, 7 (2022), 3614-3633.","journal-title":"IEEE TPAMI"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_31"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i14.29541"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00077"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3292995"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25411"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01970"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25424"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02638"},{"key":"e_1_3_2_1_43_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Yang Yuqi","year":"2025","unstructured":"Yuqi Yang, Peng-Tao Jiang, Qibin Hou, Hao Zhang, Jinwei Chen, and Bo Li. 2025. Multi-Task Dense Predictions via Unleashing the Power of Diffusion. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_44_1","volume-title":"Inverted Pyramid Multi-task Transformer for Dense Scene Understanding. In European Conference on Computer Vision. 514-530","author":"Ye Hanrong","year":"2022","unstructured":"Hanrong Ye and Dan Xu. 2022. Inverted Pyramid Multi-task Transformer for Dense Scene Understanding. In European Conference on Computer Vision. 514-530."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01995"},{"key":"e_1_3_2_1_46_1","volume-title":"TaskPrompter: Spatial-Channel Multi-Task Prompting for Dense Scene Understanding. In The Eleventh International Conference on Learning Representations.","author":"Ye Hanrong","year":"2023","unstructured":"Hanrong Ye and Dan Xu. 2023b. TaskPrompter: Spatial-Channel Multi-Task Prompting for Dense Scene Understanding. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02641"},{"key":"e_1_3_2_1_48_1","first-page":"1","article-title":"BitFit","author":"Zaken Elad Ben","year":"2022","unstructured":"Elad Ben Zaken, Yoav Goldberg, and Shauli Ravfogel. 2022. BitFit: Simple Parameter-efficient Fine-tuning for Transformer-based Masked Language-models. In ACL. 1-9.","journal-title":"Simple Parameter-efficient Fine-tuning for Transformer-based Masked Language-models. In ACL."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755586","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:03:59Z","timestamp":1765310639000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755586"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":48,"alternative-id":["10.1145\/3746027.3755586","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755586","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}