{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:13:52Z","timestamp":1776888832873,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"name":"Young Scientists Fund of the National Natural Science Foundation of China","award":["62306219"],"award-info":[{"award-number":["62306219"]}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022ZD0160604"],"award-info":[{"award-number":["2022ZD0160604"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Research and Development Program of Hubei Province","award":["2023BAB083"],"award-info":[{"award-number":["2023BAB083"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755810","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"2264-2273","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["MAP: Parameter-Efficient Tuning for Referring Expression Comprehension via Multi-Modal Adaptive Positional Encoding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6654-2294","authenticated-orcid":false,"given":"Ruilin","family":"Yao","sequence":"first","affiliation":[{"name":"Wuhan University of Technology, Wuhan, Hubei, China, Foundation Model Research Center, Institute of Automation, Chinese Academy of Sciences, Beijing, China, and Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4867-6811","authenticated-orcid":false,"given":"Yi","family":"Rong","sequence":"additional","affiliation":[{"name":"Wuhan University of Technology, Wuhan, Hubei, China and Sanya Science and Education Innovation Park, Wuhan University of Technology, Sanya, Hainan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1451-0019","authenticated-orcid":false,"given":"Tianyu","family":"Zou","sequence":"additional","affiliation":[{"name":"Wuhan University of Technology, Wuhan, Hubei, China and Sanya Science and Education Innovation Park, Wuhan University of Technology, Sanya, Hainan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7302-4627","authenticated-orcid":false,"given":"Bo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sanya Science and Education Innovation Park, Wuhan University of Technology, Sanya, Hainan, China and Wuhan University of Technology, Wuhan, Hubei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0242-6481","authenticated-orcid":false,"given":"Jian","family":"Li","sequence":"additional","affiliation":[{"name":"Nanjing University, Nanjing, Jiangsu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3836-0664","authenticated-orcid":false,"given":"Shengwu","family":"Xiong","sequence":"additional","affiliation":[{"name":"Interdisciplinary Artificial Intelligence Research Institute, Wuhan College, Wuhan, Hubei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1167-525X","authenticated-orcid":false,"given":"Shili","family":"Xiong","sequence":"additional","affiliation":[{"name":"School of Mathematics and Statistics, Wuhan University of Technology, Wuhan, Hubei, 
China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arXiv:2308.12966 [cs.CV] https:\/\/arxiv.org\/abs\/2308.12966"},{"key":"e_1_3_2_1_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_4_1","volume-title":"Shikra: Unleashing multimodal llm's referential dialogue magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing multimodal llm's referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_5_1","first-page":"16664","article-title":"Adaptformer: Adapting vision transformers for scalable visual recognition","volume":"35","author":"Chen Shoufa","year":"2022","unstructured":"Shoufa Chen, Chongjian Ge, Zhan Tong, Jiangliu Wang, Yibing Song, Jue Wang, and Ping Luo. 2022. Adaptformer: Adapting vision transformers for scalable visual recognition. Advances in Neural Information Processing Systems, Vol. 35 (2022), 16664-16678.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_6_1","volume-title":"An Efficient and Effective Transformer Decoder-Based Framework for Multi-task Visual Grounding. In European Conference on Computer Vision. 125-141","author":"Chen Wei","year":"2024","unstructured":"Wei Chen, Long Chen, and Yu Wu. 2024. An Efficient and Effective Transformer Decoder-Based Framework for Multi-task Visual Grounding. In European Conference on Computer Vision. 125-141."},{"key":"e_1_3_2_1_7_1","volume-title":"Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426","author":"Chen Xinpeng","year":"2018","unstructured":"Xinpeng Chen, Lin Ma, Jingyuan Chen, Zequn Jie, Wei Liu, and Jiebo Luo. 2018. Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426 (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"SimVG: A Simple Framework for Visual Grounding with Decoupled Multi-modal Fusion. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Dai Ming","year":"2024","unstructured":"Ming Dai, Lingfeng Yang, Yihao Xu, Zhenhua Feng, and Wankou Yang. 2024. SimVG: A Simple Framework for Visual Grounding with Decoupled Multi-modal Fusion. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_2_1_10_1","volume-title":"Transvg: End-to-end visual grounding with language conditioned vision transformer","author":"Deng Jiajun","year":"2023","unstructured":"Jiajun Deng, Zhengyuan Yang, Daqing Liu, et al., 2023. Transvg: End-to-end visual grounding with language conditioned vision transformer. 
IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Hugo Jair Escalante Carlos A Hern\u00e1ndez Jesus A Gonzalez et al. 2010. The segmented and annotated IAPR TC-12 benchmark. Computer vision and image understanding Vol. 114 4 (2010) 419-428.","DOI":"10.1016\/j.cviu.2009.03.008"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_13_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, et al., 2022a. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_14_1","first-page":"9853","article-title":"Sparse structure search for delta tuning","volume":"35","author":"Hu Shengding","year":"2022","unstructured":"Shengding Hu, Zhen Zhang, Ning Ding, Yadao Wang, Yasheng Wang, Zhiyuan Liu, and Maosong Sun. 2022b. Sparse structure search for delta tuning. Advances in Neural Information Processing Systems, Vol. 35 (2022), 9853-9865.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.111144"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision Vol. 123 (2017) 32-73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_21_1","volume-title":"Referring transformer: A one-step approach to multi-task visual grounding. Advances in neural information processing systems","author":"Li Muchen","year":"2021","unstructured":"Muchen Li and Leonid Sigal. 2021. Referring transformer: A one-step approach to multi-task visual grounding. Advances in neural information processing systems, Vol. 34 (2021), 19652-19664."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00477"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25261"},{"key":"e_1_3_2_1_25_1","volume-title":"European Conference on Computer Vision. Springer, 38-55","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, et al., 2024c. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European Conference on Computer Vision. Springer, 38-55."},{"key":"e_1_3_2_1_26_1","volume-title":"DARA: Domain- and Relation-Aware Adapters Make Parameter-Efficient Tuning for Visual Grounding. In 2024 IEEE International Conference on Multimedia and Expo (ICME). 1-6.","author":"Liu Ting","year":"2024","unstructured":"Ting Liu, Xuyang Liu, Siteng Huang, et al., 2024a. 
DARA: Domain- and Relation-Aware Adapters Make Parameter-Efficient Tuning for Visual Grounding. In 2024 IEEE International Conference on Multimedia and Expo (ICME). 1-6."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.287"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.504"},{"key":"e_1_3_2_1_29_1","volume-title":"European Conference on Computer Vision. Springer, 417-435","author":"Ma Chuofan","year":"2024","unstructured":"Chuofan Ma, Yi Jiang, Jiannan Wu, Zehuan Yuan, and Xiaojuan Qi. 2024. Groma: Localized visual tokenization for grounding multimodal large language models. In European Conference on Computer Vision. Springer, 417-435."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"e_1_3_2_1_32_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni et al. 2024. DINOv2: Learning Robust Visual Features without Supervision. Transactions on Machine Learning Research (2024)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_35_1","volume-title":"Dynamic mdetr: A dynamic multimodal transformer decoder for visual grounding","author":"Shi Fengyuan","year":"2023","unstructured":"Fengyuan Shi, Ruopeng Gao, Weilin Huang, and Limin Wang. 2023. Dynamic mdetr: A dynamic multimodal transformer decoder for visual grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01045"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"e_1_3_2_1_38_1","volume-title":"International Conference on Machine Learning. PMLR, 23318-23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, et al., 2022. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning. PMLR, 23318-23340."},{"key":"e_1_3_2_1_39_1","first-page":"47378","article-title":"Referencing where to focus: Improving visual grounding with referential query","volume":"37","author":"Wang Yabing","year":"2024","unstructured":"Yabing Wang, Zhuotao Tian, Qingpei Guo, Zheng Qin, Sanping Zhou, Ming Yang, and Le Wang. 2024. Referencing where to focus: Improving visual grounding with referential query. Advances in Neural Information Processing Systems, Vol. 37 (2024), 47378-47399.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Linhui Xiao Xiaoshan Yang Fang Peng et al. 2023. CLIP-VG: Self-paced Curriculum Adapting of CLIP for Visual Grounding. 
IEEE Transactions on Multimedia (2023).","DOI":"10.1109\/TMM.2023.3321501"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681071"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00928"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00474"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_30"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3422879"},{"key":"e_1_3_2_1_48_1","volume-title":"Visual Grounding with Multi-modal Conditional Adaptation. In 32nd ACM International Conference on Multimedia.","author":"Yao Ruilin","year":"2024","unstructured":"Ruilin Yao, Shengwu Xiong, Yichen Zhao, and Yi Rong. 2024b. Visual Grounding with Multi-modal Conditional Adaptation. In 32nd ACM International Conference on Multimedia."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"e_1_3_2_1_50_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=2msbbX3ydD","author":"You Haoxuan","year":"2024","unstructured":"Haoxuan You, Haotian Zhang, Zhe Gan, Xianzhi Du, Bowen Zhang, Zirui Wang, Liangliang Cao, Shih-Fu Chang, and Yinfei Yang. 2024. Ferret: Refer and Ground Anything Anywhere at Any Granularity. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=2msbbX3ydD"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2023.3308969"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_35"},{"key":"e_1_3_2_1_57_1","volume-title":"Visual Grounding with Joint Multi-modal Representation and Interaction","author":"Zhu Hong","year":"2023","unstructured":"Hong Zhu, Qingyang Lu, Lei Xue, Mogen Xue, Guanglin Yuan, and Bineng Zhong. 2023. Visual Grounding with Joint Multi-modal Representation and Interaction. IEEE Transactions on Instrumentation and Measurement (2023)."},{"key":"e_1_3_2_1_58_1","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Hao Tian Yuchen Duan Weijie Su Jie Shao Zhangwei Gao Erfei Cui Xuehui Wang Yue Cao Yangzhou Liu Xingguang Wei Hongjie Zhang Haomin Wang Weiye Xu Hao Li Jiahao Wang Nianchen Deng Songze Li Yinan He Tan Jiang Jiapeng Luo Yi Wang Conghui He Botian Shi Xingcheng Zhang Wenqi Shao Junjun He Yingtong Xiong Wenwen Qu Peng Sun Penglong Jiao Han Lv Lijun Wu Kaipeng Zhang Huipeng Deng Jiaye Ge Kai Chen Limin Wang Min Dou Lewei Lu Xizhou Zhu Tong Lu Dahua Lin Yu Qiao Jifeng Dai and Wenhai Wang. 2025. InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. 
arXiv:2504.10479 [cs.CV] https:\/\/arxiv.org\/abs\/2504.10479"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755810","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:16:06Z","timestamp":1765340166000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755810"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":58,"alternative-id":["10.1145\/3746027.3755810","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755810","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
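
The record above is the envelope returned by the public Crossref REST API for this DOI (GET https://api.crossref.org/works/10.1145/3746027.3755810). Below is a minimal sketch of how such a record can be retrieved and its key bibliographic fields picked out, assuming network access and the third-party requests package; every field name used in the code appears in the record above.

# Minimal sketch: fetch this Crossref work record and extract a few fields.
# Assumes the public Crossref REST API and the `requests` package.
import requests

DOI = "10.1145/3746027.3755810"

# The API wraps the work metadata in a {"status": ..., "message": {...}} envelope.
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]

# "title" and "container-title" are lists; "author" entries carry given/family names.
title = work["title"][0]
authors = [f'{a.get("given", "")} {a.get("family", "")}'.strip()
           for a in work.get("author", [])]
venue = (work.get("container-title") or [""])[0]

print(title)
print("; ".join(authors))
print(venue, "pp.", work.get("page", "n/a"), "doi:", work["DOI"])

In practice, Crossref asks callers to identify themselves (for example, a mailto address in the User-Agent header) so that requests are routed to its "polite" pool.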