{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T16:24:56Z","timestamp":1778171096716,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":80,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Pengcheng Laboratory Research Project","award":["PCL2023A08"],"award-info":[{"award-number":["PCL2023A08"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2021ZD0112200"],"award-info":[{"award-number":["2021ZD0112200"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62036012, U23A20387, 62322212, 62072455"],"award-info":[{"award-number":["62036012, U23A20387, 62322212, 62072455"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681071","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"5460-5469","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":25,"title":["HiVG: Hierarchical Multimodal Fine-grained Modulation for Visual Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2592-5264","authenticated-orcid":false,"given":"Linhui","family":"Xiao","sequence":"first","affiliation":[{"name":"MAIS, Institute of Automation, Chinese Academy of Sciences &amp; Pengcheng Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5453-9755","authenticated-orcid":false,"given":"Xiaoshan","family":"Yang","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, Chinese Academy of Sciences &amp; Pengcheng Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3948-7413","authenticated-orcid":false,"given":"Fang","family":"Peng","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, Chinese Academy of Sciences &amp; Pengcheng Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6110-4036","authenticated-orcid":false,"given":"Yaowei","family":"Wang","sequence":"additional","affiliation":[{"name":"Pengcheng Laboratory &amp; Harbin Institute of Technology, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8343-9665","authenticated-orcid":false,"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"MAIS, Institute of Automation, Chinese Academy of Sciences &amp; Pengcheng Laboratory, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. 
Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings, Part VI 16","author":"Cao Jize","year":"2020","unstructured":"Jize Cao, Zhe Gan, Yu Cheng, Licheng Yu, Yen-Chun Chen, and Jingjing Liu. 2020. Behind the scene: Revealing the secrets of pre-trained vision-and-language models. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part VI 16. Springer, 565--580."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings, Part I 16","author":"Carion Nicolas","year":"2020","unstructured":"Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 2020. End-to-end object detection with transformers. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part I 16. Springer, 213--229."},{"key":"e_1_3_2_1_5_1","volume-title":"LION: Empowering multimodal large language model with dual-level visual knowledge. arXiv preprint arXiv:2311.11860","author":"Chen Gongwei","year":"2023","unstructured":"Gongwei Chen, Leyang Shen, Rui Shao, Xiang Deng, and Liqiang Nie. 2023. LION: Empowering multimodal large language model with dual-level visual knowledge. arXiv preprint arXiv:2311.11860 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.893"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_2_1_10_1","volume-title":"Transvg: End-to-end visual grounding with language conditioned vision transformer","author":"Deng Jiajun","year":"2023","unstructured":"Jiajun Deng, Zhengyuan Yang, Daqing Liu, Tianlang Chen, Wengang Zhou, Yanyong Zhang, Houqiang Li, and Wanli Ouyang. 2023. Transvg: End-to-end visual grounding with language conditioned vision transformer. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. 
In International Conference on Learning Representations."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3358415"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25085-9_1"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911066"},{"key":"e_1_3_2_1_18_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. 2021. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.493"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_1_21_1","volume-title":"Referitgame: Referring to objects in photographs of natural scenes. 787--798.","author":"Kazemzadeh Sahar","year":"2014","unstructured":"Sahar Kazemzadeh, Vicente Ordonez, Mark Matten, and Tamara Berg. 2014. Referitgame: Referring to objects in photographs of natural scenes. 787--798."},{"key":"e_1_3_2_1_22_1","volume-title":"CLIPREC: Graph-Based Domain Adaptive Network for Zero-Shot Referring Expression Comprehension","author":"Ke Jingcheng","year":"2023","unstructured":"Jingcheng Ke, Jia Wang, Jun-Cheng Chen, I-Hong Jhuo, Chia-Wen Lin, and Yen-Yu Lin. 2023. CLIPREC: Graph-Based Domain Adaptive Network for Zero-Shot Referring Expression Comprehension. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.258"},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583--5594."},{"key":"e_1_3_2_1_25_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, Vol. 25 (2012)."},{"key":"e_1_3_2_1_26_1","volume-title":"Adapting clip for phrase localization without further training. arXiv preprint arXiv:2204.03647","author":"Li Jiahao","year":"2022","unstructured":"Jiahao Li, Greg Shakhnarovich, and Raymond A Yeh. 2022. Adapting clip for phrase localization without further training. arXiv preprint arXiv:2204.03647 (2022)."},{"key":"e_1_3_2_1_27_1","first-page":"19652","article-title":"Referring transformer: A one-step approach to multi-task visual grounding","volume":"34","author":"Li Muchen","year":"2021","unstructured":"Muchen Li and Leonid Sigal. 2021. Referring transformer: A one-step approach to multi-task visual grounding. Advances in Neural Information Processing Systems, Vol. 
34 (2021), 19652--19664.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2024.3440924"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/WCSP.2019.8928098"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_2_1_32_1","unstructured":"Daqing Liu Hanwang Zhang Feng Wu and Zheng-Jun Zha. 2019. Learning to assemble neural module tree networks for visual grounding. In ICCV. 4673--4682."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25261"},{"key":"e_1_3_2_1_34_1","volume-title":"2023 e. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499","author":"Liu Shilong","year":"2023","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, et al. 2023 e. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00205"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1088\/1361-6501\/acadfb"},{"key":"e_1_3_2_1_37_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02277"},{"key":"e_1_3_2_1_39_1","volume-title":"2023 d. Foregroundness-Aware Task Disentanglement and Self-Paced Curriculum Learning for Domain Adaptive Object Detection","author":"Liu Yabo","year":"2023","unstructured":"Yabo Liu, Jinghua Wang, Linhui Xiao, Chengliang Liu, Zhihao Wu, and Yong Xu. 2023 d. Foregroundness-Aware Task Disentanglement and Self-Paced Curriculum Learning for Domain Adaptive Object Detection. IEEE Transactions on Neural Networks and Learning Systems (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_41_1","volume-title":"LGR-NET: Language Guided Reasoning Network for Referring Expression Comprehension","author":"Lu Mingcong","year":"2024","unstructured":"Mingcong Lu, Ruifan Li, Fangxiang Feng, Zhanyu Ma, and Xiaojie Wang. 2024. LGR-NET: Language Guided Reasoning Network for Referring Expression Comprehension. IEEE Transactions on Circuits and Systems for Video Technology (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_43_1","volume-title":"Self-Paced Multi-Grained Cross-Modal Interaction Modeling for Referring Expression Comprehension","author":"Miao Peihan","year":"2023","unstructured":"Peihan Miao, Wei Su, Gaoang Wang, Xuewei Li, and Xi Li. 2023. Self-Paced Multi-Grained Cross-Modal Interaction Modeling for Referring Expression Comprehension. IEEE Transactions on Image Processing (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"V-net: Fully convolutional neural networks for volumetric medical image segmentation. 
In 2016 fourth international conference on 3D vision (3DV). Ieee, 565--571.","author":"Milletari Fausto","year":"2016","unstructured":"Fausto Milletari, Nassir Navab, and Seyed-Ahmad Ahmadi. 2016. V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). Ieee, 565--571."},{"key":"e_1_3_2_1_45_1","volume-title":"Sgva-clip: Semantic-guided visual adapting of vision-language models for few-shot image classification","author":"Peng Fang","year":"2023","unstructured":"Fang Peng, Xiaoshan Yang, Linhui Xiao, Yaowei Wang, and Changsheng Xu. 2023. Sgva-clip: Semantic-guided visual adapting of vision-language models for few-shot image classification. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3042066"},{"key":"e_1_3_2_1_49_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_50_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_52_1","volume-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"e_1_3_2_1_54_1","volume-title":"Dynamic mdetr: A dynamic multimodal transformer decoder for visual grounding","author":"Shi Fengyuan","year":"2022","unstructured":"Fengyuan Shi, Ruopeng Gao, Weilin Huang, and Limin Wang. 2022. Dynamic mdetr: A dynamic multimodal transformer decoder for visual grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2022)."},{"key":"e_1_3_2_1_55_1","volume-title":"Continual diffusion: Continual customization of text-to-image diffusion with c-lora. arXiv preprint arXiv:2304.06027","author":"Smith James Seale","year":"2023","unstructured":"James Seale Smith, Yen-Chang Hsu, Lingyu Zhang, Ting Hua, Zsolt Kira, Yilin Shen, and Hongxia Jin. 2023. Continual diffusion: Continual customization of text-to-image diffusion with c-lora. 
arXiv preprint arXiv:2304.06027 (2023)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01045"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"e_1_3_2_1_58_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017), 5998--6008."},{"key":"e_1_3_2_1_60_1","volume-title":"NeurIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_1_61_1","volume-title":"International Conference on Machine Learning. PMLR, 23318--23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning. PMLR, 23318--23340."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.robot.2019.03.012"},{"key":"e_1_3_2_1_64_1","volume-title":"CLIP-VG: Self-paced Curriculum Adapting of CLIP for Visual Grounding","author":"Xiao Linhui","year":"2023","unstructured":"Linhui Xiao, Xiaoshan Yang, Fang Peng, Ming Yan, Yaowei Wang, and Changsheng Xu. 2023. CLIP-VG: Self-paced Curriculum Adapting of CLIP for Visual Grounding. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611757"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02524"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3140611"},{"key":"e_1_3_2_1_69_1","volume-title":"Proceedings, Part XIV 16","author":"Yang Zhengyuan","year":"2020","unstructured":"Zhengyuan Yang, Tianlang Chen, Liwei Wang, and Jiebo Luo. 2020. Improving one-stage visual grounding by recursive sub-query construction. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XIV 16. Springer, 387--404."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_30"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Zhengyuan Yang Boqing Gong Liwei Wang Wenbing Huang Dong Yu and Jiebo Luo. 2019. A fast and accurate one-stage approach to visual grounding. In ICCV. 
4683--4693.","DOI":"10.1109\/ICCV.2019.00478"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"e_1_3_2_1_73_1","unstructured":"Jiabo Ye Junfeng Tian Ming Yan Xiaoshan Yang Xuwu Wang Ji Zhang Liang He and Xin Lin. 2022. Shifting More Attention to Visual Backbone: Query-modulated Refinement Networks for End-to-End Visual Grounding. In CVPR. 15502--15512."},{"key":"e_1_3_2_1_74_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"You Haoxuan","year":"2024","unstructured":"Haoxuan You, Haotian Zhang, Zhe Gan, Xianzhi Du, Bowen Zhang, Zirui Wang, Liangliang Cao, Shih-Fu Chang, and Yinfei Yang. 2024. Ferret: Refer and Ground Anything Anywhere at Any Granularity. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_1_76_1","volume-title":"Proceedings, Part II 14","author":"Yu Licheng","year":"2016","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C Berg, and Tamara L Berg. 2016. Modeling context in referring expressions. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part II 14. Springer, 69--85."},{"key":"e_1_3_2_1_77_1","volume-title":"Joey Tianyi Zhou, and Yew-Soon Ong","author":"Zhao Heng","year":"2022","unstructured":"Heng Zhao, Joey Tianyi Zhou, and Yew-Soon Ong. 2022. Word2pix: Word to pixel cross-attention transformer in visual grounding. IEEE Transactions on Neural Networks and Learning Systems (2022)."},{"key":"e_1_3_2_1_78_1","unstructured":"Yiyi Zhou Rongrong Ji Gen Luo Xiaoshuai Sun Jinsong Su Xinghao Ding Chia-Wen Lin and Qi Tian. 2021. A real-time global inference network for one-stage referring expression comprehension. (2021)."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_35"},{"key":"e_1_3_2_1_80_1","volume-title":"Visual Grounding with Joint Multi-modal Representation and Interaction","author":"Zhu Hong","year":"2023","unstructured":"Hong Zhu, Qingyang Lu, Lei Xue, Mogen Xue, Guanglin Yuan, and Bineng Zhong. 2023. Visual Grounding with Joint Multi-modal Representation and Interaction. 
IEEE Transactions on Instrumentation and Measurement (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681071","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681071","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:52Z","timestamp":1750294672000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681071"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":80,"alternative-id":["10.1145\/3664647.3681071","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681071","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
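
A minimal sketch of how a record like the one above can be retrieved and read. It queries the public Crossref REST API (https://api.crossref.org/works/{DOI}), whose response envelope matches this document (status / message-type / message); everything else here (the variable names, the choice of printed fields, use of the Python standard library only) is an illustrative assumption, not part of the record.

# Fetch the Crossref work record for this paper and print a few fields.
import json
import urllib.request

DOI = "10.1145/3664647.3681071"

def fetch_work(doi: str) -> dict:
    # The Crossref REST API returns the same JSON envelope shown above.
    url = "https://api.crossref.org/works/" + doi
    with urllib.request.urlopen(url) as resp:
        envelope = json.load(resp)
    if envelope["status"] != "ok" or envelope["message-type"] != "work":
        raise ValueError("unexpected Crossref response")
    return envelope["message"]

work = fetch_work(DOI)
title = work["title"][0]                                   # "HiVG: Hierarchical Multimodal ..."
authors = ["%s %s" % (a["given"], a["family"]) for a in work["author"]]
print(title)
print(", ".join(authors))
# Counts as deposited in the record: cited 25 times, cites 80 references.
print("cited by %d; cites %d references" % (work["is-referenced-by-count"], work["references-count"]))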