{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T04:03:53Z","timestamp":1778385833200,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176194"],"award-info":[{"award-number":["62176194"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Research and Development Program of Hubei Province","award":["2023BAB083"],"award-info":[{"award-number":["2023BAB083"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022ZD0160604"],"award-info":[{"award-number":["2022ZD0160604"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Hainan Provincial Joint Project of Sanya Yazhou Bay Science and Technology City","award":["2021JJLH0099"],"award-info":[{"award-number":["2021JJLH0099"]}]},{"name":"Young Scientists Fund of the National Natural Science Foundation of China","award":["62306219"],"award-info":[{"award-number":["62306219"]}]},{"name":"Project of Sanya Yazhou Bay Science and Technology City","award":["SCKJ-JYRC-2022-76, SKJC-2022-PTDX-031"],"award-info":[{"award-number":["SCKJ-JYRC-2022-76, SKJC-2022-PTDX-031"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681256","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"3877-3886","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["Visual Grounding with Multi-modal Conditional Adaptation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6654-2294","authenticated-orcid":false,"given":"Ruilin","family":"Yao","sequence":"first","affiliation":[{"name":"Wuhan University of Technology &amp; Shanghai Artificial Intelligence Laboratory, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4006-7029","authenticated-orcid":false,"given":"Shengwu","family":"Xiong","sequence":"additional","affiliation":[{"name":"Wuhan University of Technology &amp; Sanya Science and Education Innovation Park of Wuhan University of Technology, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7552-3453","authenticated-orcid":false,"given":"Yichen","family":"Zhao","sequence":"additional","affiliation":[{"name":"Sanya Science and Education Innovation Park of Wuhan University of Technology &amp; Wuhan University of Technology, Sanya, Hainan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4867-6811","authenticated-orcid":false,"given":"Yi","family":"Rong","sequence":"additional","affiliation":[{"name":"Wuhan University of Technology &amp; Sanya Science and Education Innovation Park of Wuhan University of Technology, Wuhan, Hubei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16188"},{"key":"e_1_3_2_1_4_1","first-page":"16664","article-title":"Adaptformer: Adapting vision transformers for scalable visual recognition","volume":"35","author":"Chen Shoufa","year":"2022","unstructured":"Shoufa Chen, Chongjian Ge, Zhan Tong, Jiangliu Wang, Yibing Song, Jue Wang, and Ping Luo. 2022. Adaptformer: Adapting vision transformers for scalable visual recognition. Advances in Neural Information Processing Systems, Vol. 35 (2022), 16664--16678.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_5_1","volume-title":"Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426","author":"Chen Xinpeng","year":"2018","unstructured":"Xinpeng Chen, Lin Ma, Jingyuan Chen, Zequn Jie, Wei Liu, and Jiebo Luo. 2018. Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426 (2018)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_2_1_7_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"The segmented and annotated IAPR TC-12 benchmark. Computer vision and image understanding","author":"Escalante Hugo Jair","year":"2010","unstructured":"Hugo Jair Escalante, Carlos A Hern\u00e1ndez, Jesus A Gonzalez, Aurelio L\u00f3pez-L\u00f3pez, Manuel Montes, Eduardo F Morales, L Enrique Sucar, Luis Villasenor, and Michael Grubinger. 2010. The segmented and annotated IAPR TC-12 benchmark. Computer vision and image understanding, Vol. 114, 4 (2010), 419--428."},{"key":"e_1_3_2_1_9_1","volume-title":"ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models.","author":"Gagnon-Audet Jean-Christophe","year":"2023","unstructured":"Jean-Christophe Gagnon-Audet, Ricardo Pio Monti, and David J Schwab. 2023. AWE: Adaptive weight-space ensembling for few-shot fine-tuning. In ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models."},{"key":"e_1_3_2_1_10_1","volume-title":"Making pre-trained language models better few-shot learners. arXiv preprint arXiv:2012.15723","author":"Gao Tianyu","year":"2020","unstructured":"Tianyu Gao, Adam Fisch, and Danqi Chen. 2020. Making pre-trained language models better few-shot learners. arXiv preprint arXiv:2012.15723 (2020)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911066"},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Machine Learning. PMLR, 2790--2799","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-efficient transfer learning for NLP. In International Conference on Machine Learning. PMLR, 2790--2799."},{"key":"e_1_3_2_1_14_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_15_1","volume-title":"Knowledgeable prompt-tuning: Incorporating knowledge into prompt verbalizer for text classification. arXiv preprint arXiv:2108.02035","author":"Hu Shengding","year":"2021","unstructured":"Shengding Hu, Ning Ding, Huadong Wang, Zhiyuan Liu, Jingang Wang, Juanzi Li, Wei Wu, and Maosong Sun. 2021. Knowledgeable prompt-tuning: Incorporating knowledge into prompt verbalizer for text classification. arXiv preprint arXiv:2108.02035 (2021)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_1_17_1","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume":"34","author":"Mahabadi Rabeeh Karimi","year":"2021","unstructured":"Rabeeh Karimi Mahabadi, James Henderson, and Sebastian Ruder. 2021. Compacter: Efficient low-rank hypercomplex adapter layers. Advances in Neural Information Processing Systems, Vol. 34 (2021), 1022--1035.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"e_1_3_2_1_19_1","volume-title":"Fahad Shahbaz Khan, and Mubarak Shah.","author":"Khan Salman","year":"2022","unstructured":"Salman Khan, Muzammal Naseer, Munawar Hayat, Syed Waqas Zamir, Fahad Shahbaz Khan, and Mubarak Shah. 2022. Transformers in vision: A survey. ACM computing surveys (CSUR), Vol. 54, 10s (2022), 1--41."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_21_1","volume-title":"Referring transformer: A one-step approach to multi-task visual grounding. Advances in neural information processing systems","author":"Li Muchen","year":"2021","unstructured":"Muchen Li and Leonid Sigal. 2021. Referring transformer: A one-step approach to multi-task visual grounding. Advances in neural information processing systems, Vol. 34 (2021), 19652--19664."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612584"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00477"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00205"},{"key":"e_1_3_2_1_26_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings, Part IV 14","author":"Nagaraja Varun K","year":"2016","unstructured":"Varun K Nagaraja, Vlad I Morariu, and Larry S Davis. 2016. Modeling context between objects for referring expression understanding. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part IV 14. Springer, 792--807."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413850"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_32_1","volume-title":"Dynamic mdetr: A dynamic multimodal transformer decoder for visual grounding. arXiv preprint arXiv:2209.13959","author":"Shi Fengyuan","year":"2022","unstructured":"Fengyuan Shi, Ruopeng Gao, Weilin Huang, and Limin Wang. 2022. Dynamic mdetr: A dynamic multimodal transformer decoder for visual grounding. arXiv preprint arXiv:2209.13959 (2022)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","unstructured":"Wei Su Peihan Miao Huanzhang Dou Yongjian Fu and Xi Li. 2023. Referring Expression Comprehension Using Language Adaptive Inference. In Proceedings of the Thirty-Seventh AAAI Conference on Artificial Intelligence and Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence and Thirteenth Symposium on Educational Advances in Artificial Intelligence (AAAI'23\/IAAI'23\/EAAI'23). AAAI Press Article 262 9 pages. https:\/\/doi.org\/10.1609\/aaai.v37i2.25331","DOI":"10.1609\/aaai.v37i2.25331"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01045"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3227466"},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Machine Learning. PMLR, 23965--23998","author":"Wortsman Mitchell","year":"2022","unstructured":"Mitchell Wortsman, Gabriel Ilharco, Samir Ya Gadre, Rebecca Roelofs, Raphael Gontijo-Lopes, Ari S Morcos, Hongseok Namkoong, Ali Farhadi, Yair Carmon, Simon Kornblith, et al. 2022. Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time. In International Conference on Machine Learning. PMLR, 23965--23998."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00780"},{"key":"e_1_3_2_1_39_1","volume-title":"Exploring efficient few-shot adaptation for vision transformers. arXiv preprint arXiv:2301.02419","author":"Xu Chengming","year":"2023","unstructured":"Chengming Xu, Siqian Yang, Yabiao Wang, Zhanxiong Wang, Yanwei Fu, and Xiangyang Xue. 2023. Exploring efficient few-shot adaptation for vision transformers. arXiv preprint arXiv:2301.02419 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00928"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00474"},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings, Part XIV 16","author":"Yang Zhengyuan","year":"2020","unstructured":"Zhengyuan Yang, Tianlang Chen, Liwei Wang, and Jiebo Luo. 2020. Improving one-stage visual grounding by recursive sub-query construction. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XIV 16. Springer, 387--404."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475313"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings, Part II 14","author":"Yu Licheng","year":"2016","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C Berg, and Tamara L Berg. 2016. Modeling context in referring expressions. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part II 14. Springer, 69--85."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"key":"e_1_3_2_1_50_1","first-page":"36067","article-title":"Glipv2: Unifying localization and vision-language understanding","volume":"35","author":"Zhang Haotian","year":"2022","unstructured":"Haotian Zhang, Pengchuan Zhang, Xiaowei Hu, Yen-Chun Chen, Liunian Li, Xiyang Dai, Lijuan Wang, Lu Yuan, Jenq-Neng Hwang, and Jianfeng Gao. 2022. Glipv2: Unifying localization and vision-language understanding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36067--36080.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_51_1","volume-title":"Adaptive budget allocation for parameter-efficient fine-tuning. arXiv preprint arXiv:2303.10512","author":"Zhang Qingru","year":"2023","unstructured":"Qingru Zhang, Minshuo Chen, Alexander Bukharin, Pengcheng He, Yu Cheng, Weizhu Chen, and Tuo Zhao. 2023. Adaptive budget allocation for parameter-efficient fine-tuning. arXiv preprint arXiv:2303.10512 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01451"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681256","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681256","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:42Z","timestamp":1750295862000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681256"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":52,"alternative-id":["10.1145\/3664647.3681256","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681256","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}