{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:50:23Z","timestamp":1765309823383,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62306309?62406037?U23B2054?62276263"],"award-info":[{"award-number":["62306309?62406037?U23B2054?62276263"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755279","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"4010-4019","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ReMeREC: Relation-aware and Multi-entity Referring Expression Comprehension"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-4538-3842","authenticated-orcid":false,"given":"Yizhi","family":"Hu","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0605-3867","authenticated-orcid":false,"given":"Zezhao","family":"Tian","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9772-5707","authenticated-orcid":false,"given":"Xingqun","family":"Qi","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8689-4521","authenticated-orcid":false,"given":"Chen","family":"Su","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3704-8037","authenticated-orcid":false,"given":"Bingkun","family":"Yang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8469-0472","authenticated-orcid":false,"given":"Junhui","family":"Yin","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9506-7643","authenticated-orcid":false,"given":"Muyi","family":"Sun","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3043-2122","authenticated-orcid":false,"given":"Man","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4029-9935","authenticated-orcid":false,"given":"Zhenan","family":"Sun","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00438"},{"key":"e_1_3_2_1_2_1","first-page":"2544","article-title":"Visual object tracking using adaptive correlation filters. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"Bolme David S","year":"2010","unstructured":"David S Bolme, J Ross Beveridge, Bruce A Draper, and Yui Man Lui. 2010. Visual object tracking using adaptive correlation filters. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 2544-2550.","journal-title":"IEEE"},{"key":"e_1_3_2_1_3_1","volume-title":"Shikra: Unleashing multimodal llm's referential dialogue magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing multimodal llm's referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_4_1","volume-title":"Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426","author":"Chen Xinpeng","year":"2018","unstructured":"Xinpeng Chen, Lin Ma, Jingyuan Chen, Zequn Jie, Wei Liu, and Jiebo Luo. 2018. Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426 (2018)."},{"key":"e_1_3_2_1_5_1","volume-title":"Simvg: A simple framework for visual grounding with decoupled multi-modal fusion. Advances in neural information processing systems 37","author":"Dai Ming","year":"2024","unstructured":"Ming Dai, Lingfeng Yang, Yihao Xu, Zhenhua Feng, and Wankou Yang. 2024. Simvg: A simple framework for visual grounding with decoupled multi-modal fusion. Advances in neural information processing systems 37 (2024), 121670-121698."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_2_1_7_1","volume-title":"Transvg: End-to-end visual grounding with language conditioned vision transformer","author":"Deng Jiajun","year":"2023","unstructured":"Jiajun Deng, Zhengyuan Yang, Daqing Liu, Tianlang Chen, Wengang Zhou, Yanyong Zhang, Houqiang Li, and Wanli Ouyang. 2023. Transvg: End-to-end visual grounding with language conditioned vision transformer. IEEE transactions on pattern analysis and machine intelligence 45, 11 (2023), 13636-13652."},{"key":"e_1_3_2_1_8_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 16321-16330","author":"Ding Henghui","year":"2021","unstructured":"Henghui Ding, Chang Liu, Suchen Wang, and Xudong Jiang. 2021. Visionlanguage transformer and query generation for referring segmentation. In Proceedings of the IEEE\/CVF international conference on computer vision. 16321-16330."},{"key":"e_1_3_2_1_10_1","volume-title":"Visionand- language navigation: A survey of tasks, methods, and future directions. arXiv preprint arXiv:2203.12667","author":"Gu Jing","year":"2022","unstructured":"Jing Gu, Eliana Stefani, QiWu, Jesse Thomason, and Xin EricWang. 2022. Visionand- language navigation: A survey of tasks, methods, and future directions. arXiv preprint arXiv:2203.12667 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01362"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01244"},{"key":"e_1_3_2_1_13_1","volume-title":"GREC: Generalized referring expression comprehension. arXiv preprint arXiv:2308.16182","author":"He Shuting","year":"2023","unstructured":"Shuting He, Henghui Ding, Chang Liu, and Xudong Jiang. 2023. GREC: Generalized referring expression comprehension. arXiv preprint arXiv:2308.16182 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1214"},{"key":"e_1_3_2_1_15_1","volume-title":"High-speed tracking with kernelized correlation filters","author":"Henriques Jo\u00e3o F","year":"2014","unstructured":"Jo\u00e3o F Henriques, Rui Caseiro, Pedro Martins, and Jorge Batista. 2014. High-speed tracking with kernelized correlation filters. IEEE transactions on pattern analysis and machine intelligence 37, 3 (2014), 583-596."},{"key":"e_1_3_2_1_16_1","volume-title":"European Conference on Computer Vision. Springer, 3-23","author":"Ho Chih-Hui","year":"2022","unstructured":"Chih-Hui Ho, Srikar Appalaraju, Bhavan Jasani, R Manmatha, and Nuno Vasconcelos. 2022. Yoro-lightweight end to end visual grounding. In European Conference on Computer Vision. Springer, 3-23."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911066"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00376"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01661"},{"key":"e_1_3_2_1_20_1","volume-title":"European Conference on Computer Vision. Springer, 290-305","author":"Huang Yi-Xin","year":"2024","unstructured":"Yi-Xin Huang, Hou-I Liu, Hong-Han Shuai, and Wen-Huang Cheng. 2024. Dqdetr: Detr with dynamic query for tiny object detection. In European Conference on Computer Vision. Springer, 290-305."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 1780-1790","author":"Kamath Aishwarya","year":"2021","unstructured":"Aishwarya Kamath, Mannat Singh, Yann LeCun, Gabriel Synnaeve, Ishan Misra, and Nicolas Carion. 2021. Mdetr-modulated detection for end-to-end multimodal understanding. In Proceedings of the IEEE\/CVF international conference on computer vision. 1780-1790."},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PMLR, 5583-5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning. PMLR, 5583-5594."},{"key":"e_1_3_2_1_23_1","volume-title":"Referring transformer: A one-step approach to multi-task visual grounding. Advances in neural information processing systems 34","author":"Li Muchen","year":"2021","unstructured":"Muchen Li and Leonid Sigal. 2021. Referring transformer: A one-step approach to multi-task visual grounding. Advances in neural information processing systems 34 (2021), 19652-19664."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02259"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00477"},{"key":"e_1_3_2_1_28_1","volume-title":"European Conference on Computer Vision. Springer, 38-55","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, et al. 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In European Conference on Computer Vision. Springer, 38-55."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00205"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_31_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"e_1_3_2_1_36_1","volume-title":"Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767","author":"Redmon Joseph","year":"2018","unstructured":"Joseph Redmon and Ali Farhadi. 2018. Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767 (2018)."},{"key":"e_1_3_2_1_37_1","volume-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"Ren Shaoqing","year":"2016","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2016. Faster R-CNN: Towards real-time object detection with region proposal networks. IEEE transactions on pattern analysis and machine intelligence 39, 6 (2016), 1137-1149."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3328185"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25331"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01045"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01384"},{"key":"e_1_3_2_1_42_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Theo Gevers, and Arnold WM Smeulders.","author":"Uijlings Jasper RR","year":"2013","unstructured":"Jasper RR Uijlings, Koen EA Van De Sande, Theo Gevers, and Arnold WM Smeulders. 2013. Selective search for object recognition. International journal of computer vision 104 (2013), 154-171."},{"key":"e_1_3_2_1_44_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1960-1968","author":"Wang Peng","key":"e_1_3_2_1_45_1","unstructured":"Peng Wang, Qi Wu, Jiewei Cao, Chunhua Shen, Lianli Gao, and Anton van den Hengel. 2019. Neighbourhood watch: Referring expression comprehension via language-guided graph attention networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1960-1968."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681071"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3321501"},{"key":"e_1_3_2_1_48_1","first-page":"79095","article-title":"Described object detection: Liberating object detection with flexible expressions","volume":"36","author":"Xie Chi","year":"2023","unstructured":"Chi Xie, Zhao Zhang, Yixuan Wu, Feng Zhu, Rui Zhao, and Shuang Liang. 2023. Described object detection: Liberating object detection with flexible expressions. Advances in Neural Information Processing Systems 36 (2023), 79095-79107.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00928"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00474"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 6281-6290","author":"Yu Zhou","year":"2019","unstructured":"Zhou Yu, Jun Yu, Yuhao Cui, Dacheng Tao, and Qi Tian. 2019. Deep modular coattention networks for visual question answering. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 6281-6290."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3183827"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIM.2023.3324362"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755279","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:40Z","timestamp":1765309540000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755279"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":59,"alternative-id":["10.1145\/3746027.3755279","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755279","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}