{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T15:25:15Z","timestamp":1774538715118,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Zhejiang Province Pioneer Research and Development Project Research on Multi-modal Traffic Accident Holographic Restoration and Scene Database Construction Based on Vehicle-cloud Intersection","award":["2024C01017"],"award-info":[{"award-number":["2024C01017"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681581","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"2059-2068","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Task-Conditional Adapter for Multi-Task Dense Prediction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4721-5794","authenticated-orcid":false,"given":"Fengze","family":"Jiang","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3405-7473","authenticated-orcid":false,"given":"Shuling","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9955-3569","authenticated-orcid":false,"given":"Xiaojin","family":"Gong","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01743"},{"key":"e_1_3_2_1_2_1","volume-title":"European conference on computer vision. Springer, 205--218","author":"Cao Hu","year":"2022","unstructured":"Hu Cao, Yueyue Wang, Joy Chen, Dongsheng Jiang, Xiaopeng Zhang, Qi Tian, and Manning Wang. 2022. Swin-unet: Unet-like pure transformer for medical image segmentation. In European conference on computer vision. Springer, 205--218."},{"key":"e_1_3_2_1_3_1","volume-title":"Multitask learning. Machine learning","author":"Caruana Rich","year":"1997","unstructured":"Rich Caruana. 1997. Multitask learning. Machine learning, Vol. 28 (1997), 41--75."},{"key":"e_1_3_2_1_4_1","first-page":"16664","article-title":"Adaptformer: Adapting vision transformers for scalable visual recognition","volume":"35","author":"Chen Shoufa","year":"2022","unstructured":"Shoufa Chen, Chongjian Ge, Zhan Tong, Jiangliu Wang, Yibing Song, Jue Wang, and Ping Luo. 2022. Adaptformer: Adapting vision transformers for scalable visual recognition. Advances in Neural Information Processing Systems, Vol. 35 (2022), 16664--16678.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01591"},{"key":"e_1_3_2_1_6_1","volume-title":"Vision transformer adapter for dense predictions. arXiv preprint arXiv:2205.08534","author":"Chen Zhe","year":"2022","unstructured":"Zhe Chen, Yuchen Duan, Wenhai Wang, Junjun He, Tong Lu, Jifeng Dai, and Yu Qiao. 2022. Vision transformer adapter for dense predictions. arXiv preprint arXiv:2205.08534 (2022)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01138"},{"key":"e_1_3_2_1_8_1","volume-title":"Multi-task learning with deep neural networks: A survey. arXiv preprint arXiv:2009.09796","author":"Crawshaw Michael","year":"2020","unstructured":"Michael Crawshaw. 2020. Multi-task learning with deep neural networks: A survey. arXiv preprint arXiv:2009.09796 (2020)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00626-4"},{"key":"e_1_3_2_1_11_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Christopher KI Williams, John Winn, and Andrew Zisserman.","author":"Everingham Mark","year":"2010","unstructured":"Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. 2010. The pascal visual object classes (voc) challenge. International journal of computer vision, Vol. 88 (2010), 303--338."},{"key":"e_1_3_2_1_13_1","first-page":"28441","article-title":"M^3vit: Mixture-of-experts vision transformer for efficient multi-task learning with model-accelerator co-design","volume":"35","author":"Fan Zhiwen","year":"2022","unstructured":"Zhiwen Fan, Rishov Sarkar, Ziyu Jiang, Tianlong Chen, Kai Zou, Yu Cheng, Cong Hao, Zhangyang Wang, et al. 2022. M^3vit: Mixture-of-experts vision transformer for efficient multi-task learning with model-accelerator co-design. Advances in Neural Information Processing Systems, Vol. 35 (2022), 28441--28457.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00326"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00393"},{"key":"e_1_3_2_1_16_1","volume-title":"Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415","author":"Hendrycks Dan","year":"2016","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)."},{"key":"e_1_3_2_1_17_1","volume-title":"International conference on machine learning. PMLR, 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International conference on machine learning. PMLR, 2790--2799."},{"key":"e_1_3_2_1_18_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_19_1","volume-title":"Convolutional bypasses are better vision transformer adapters. arXiv preprint arXiv:2207.07039","author":"Jie Shibo","year":"2022","unstructured":"Shibo Jie and Zhi-Hong Deng. 2022. Convolutional bypasses are better vision transformer adapters. arXiv preprint arXiv:2207.07039 (2022)."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings, Part XX 16","author":"Kanakis Menelaos","year":"2020","unstructured":"Menelaos Kanakis, David Bruggemann, Suman Saha, Stamatios Georgoulis, Anton Obukhov, and Luc Van Gool. 2020. Reparameterizing convolutions for incremental multi-task learning without task interference. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XX 16. Springer, 689--707."},{"key":"e_1_3_2_1_21_1","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume":"34","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, James Henderson, and Sebastian Ruder. 2021. Compacter: Efficient low-rank hypercomplex adapter layers. Advances in Neural Information Processing Systems, Vol. 34 (2021), 1022--1035.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00654"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01298"},{"key":"e_1_3_2_1_24_1","first-page":"1","article-title":"Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing","volume":"55","author":"Liu Pengfei","year":"2023","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Zhengbao Jiang, Hiroaki Hayashi, and Graham Neubig. 2023. Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. Comput. Surveys, Vol. 55, 9 (2023), 1--35.","journal-title":"Comput. Surveys"},{"key":"e_1_3_2_1_25_1","first-page":"36889","article-title":"Polyhistor: Parameter-efficient multi-task adaptation for dense vision tasks","volume":"35","author":"Liu Yen-Cheng","year":"2022","unstructured":"Yen-Cheng Liu, Chih-Yao Ma, Junjiao Tian, Zijian He, and Zsolt Kira. 2022. Polyhistor: Parameter-efficient multi-task adaptation for dense vision tasks. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36889--36901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_27_1","unstructured":"Ilya Loshchilov Frank Hutter et al. 2017. Fixing weight decay regularization in adam. arXiv preprint arXiv:1711.05101 Vol. 5 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"Task Indicating Transformer for Task-conditional Dense Predictions. ArXiv","author":"Lu Yuxiang","year":"2024","unstructured":"Yuxiang Lu, Shalayiding Sirejiding, Bayram Bayramli, Suizhi Huang, Yue Ding, and Hongtao Lu. 2024. Task Indicating Transformer for Task-conditional Dense Predictions. ArXiv, Vol. abs\/2403.00327 (2024). https:\/\/api.semanticscholar.org\/CorpusID:268201472"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3349865"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00195"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2021.07.014"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.433"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.119"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings, Part V 12","author":"Silberman Nathan","year":"2012","unstructured":"Nathan Silberman, Derek Hoiem, Pushmeet Kohli, and Rob Fergus. 2012. Indoor segmentation and support inference from rgbd images. In Computer Vision--ECCV 2012: 12th European Conference on Computer Vision, Florence, Italy, October 7--13, 2012, Proceedings, Part V 12. Springer, 746--760."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00818"},{"key":"e_1_3_2_1_36_1","volume-title":"Multinet: Real-time joint semantic reasoning for autonomous driving. In 2018 IEEE intelligent vehicles symposium (IV)","author":"Teichmann Marvin","year":"2018","unstructured":"Marvin Teichmann, Michael Weber, Marius Zoellner, Roberto Cipolla, and Raquel Urtasun. 2018. Multinet: Real-time joint semantic reasoning for autonomous driving. In 2018 IEEE intelligent vehicles symposium (IV). IEEE, 1013--1020."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00470"},{"key":"e_1_3_2_1_38_1","volume-title":"Marc Proesmans, Dengxin Dai, and Luc Van Gool.","author":"Vandenhende Simon","year":"2021","unstructured":"Simon Vandenhende, Stamatios Georgoulis, Wouter Van Gansbeke, Marc Proesmans, Dengxin Dai, and Luc Van Gool. 2021. Multi-task learning for dense prediction tasks: A survey. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 7 (2021), 3614--3633."},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings, Part IV 16","author":"Vandenhende Simon","year":"2020","unstructured":"Simon Vandenhende, Stamatios Georgoulis, and Luc Van Gool. 2020. Mti-net: Multi-scale task interaction networks for multi-task learning. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part IV 16. Springer, 527--543."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00097"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00077"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_18"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25411"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02638"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_30"},{"key":"e_1_3_2_1_47_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Ye Hanrong","year":"2022","unstructured":"Hanrong Ye and Dan Xu. 2022. Taskprompter: Spatial-channel multi-task prompting for dense scene understanding. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01995"},{"key":"e_1_3_2_1_49_1","volume-title":"Parameter-efficient is not sufficient: Exploring parameter, memory, and time efficient adapter tuning for dense predictions. arXiv preprint arXiv:2306.09729","author":"Yin Dongshuo","year":"2023","unstructured":"Dongshuo Yin, Xueting Han, Bin Li, Hao Feng, and Jing Bai. 2023. Parameter-efficient is not sufficient: Exploring parameter, memory, and time efficient adapter tuning for dense predictions. arXiv preprint arXiv:2306.09729 (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01926"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00972"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20231"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00423"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Ce Zhou Qian Li Chen Li Jun Yu Yixin Liu Guangjing Wang Kai Zhang Cheng Ji Qiben Yan Lifang He et al. 2023. A comprehensive survey on pretrained foundation models: A history from bert to chatgpt. arXiv preprint arXiv:2302.09419 (2023).","DOI":"10.1007\/s13042-024-02443-6"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681581","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681581","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:48Z","timestamp":1750295868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681581"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":54,"alternative-id":["10.1145\/3664647.3681581","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681581","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}