{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:13:53Z","timestamp":1776888833670,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Key R&D Program of China","award":["021YFA0715202"],"award-info":[{"award-number":["021YFA0715202"]}]},{"name":"Shenzhen Science and Technology Program","award":["KQTD20170810150821146"],"award-info":[{"award-number":["KQTD20170810150821146"]}]},{"name":"Beijing Nova Program","award":["20220484098"],"award-info":[{"award-number":["20220484098"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612584","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"5174-5184","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["LUNA: Language as Continuing Anchors for Referring Expression Comprehension"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6708-458X","authenticated-orcid":false,"given":"Yaoyuan","family":"Liang","sequence":"first","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4376-0109","authenticated-orcid":false,"given":"Zhao","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Oxford, Oxford, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1534-4549","authenticated-orcid":false,"given":"Yansong","family":"Tang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8087-3409","authenticated-orcid":false,"given":"Jiashuo","family":"Fan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0620-3561","authenticated-orcid":false,"given":"Ziran","family":"Li","sequence":"additional","affiliation":[{"name":"Meituan Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5758-0083","authenticated-orcid":false,"given":"Jingang","family":"Wang","sequence":"additional","affiliation":[{"name":"Meituan Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0259-5732","authenticated-orcid":false,"given":"Philip H.S.","family":"Torr","sequence":"additional","affiliation":[{"name":"University of Oxford, Oxford, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2827-4022","authenticated-orcid":false,"given":"Shao-Lun","family":"Huang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Vqa: Visual question answering. In ICCV.","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C Lawrence Zitnick, and Devi Parikh. 2015. Vqa: Visual question answering. In ICCV."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Nicolas Carion Francisco Massa Gabriel Synnaeve Nicolas Usunier Alexander Kirillov and Sergey Zagoruyko. 2020. End-to-End Object Detection with Transformers. In ECCV.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_3_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020. Uniter: Universal image-text representation learning. In ECCV."},{"key":"e_1_3_2_1_4_1","volume-title":"Transvg: End-to-end visual grounding with transformers. In ICCV.","author":"Deng Jiajun","year":"2021","unstructured":"Jiajun Deng, Zhengyuan Yang, Tianlang Chen, Wengang Zhou, and Houqiang Li. 2021. Transvg: End-to-end visual grounding with transformers. In ICCV."},{"key":"e_1_3_2_1_5_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL."},{"key":"e_1_3_2_1_6_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Hugo Jair Escalante Carlos A Hern\u00e1ndez Jesus A Gonzalez Aurelio L\u00f3pez-L\u00f3pez Manuel Montes Eduardo F Morales L Enrique Sucar Luis Villasenor and Michael Grubinger. 2010. The segmented and annotated IAPR TC-12 benchmark. In CVIU.","DOI":"10.1016\/j.cviu.2009.03.008"},{"key":"e_1_3_2_1_8_1","unstructured":"Zhe Gan Yen-Chun Chen Linjie Li Chen Zhu Yu Cheng and Jingjing Liu. 2020. Large-scale adversarial training for vision-and-language representation learning. In NeurIPS."},{"key":"e_1_3_2_1_9_1","volume-title":"Tall: Temporal activity localization via language query. In ICCV.","author":"Gao Jiyang","year":"2017","unstructured":"Jiyang Gao, Chen Sun, Zhenheng Yang, and Ram Nevatia. 2017. Tall: Temporal activity localization via language query. In ICCV."},{"key":"e_1_3_2_1_10_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long Short-Term Memory. In Neural Computation.","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_12_1","unstructured":"Richang Hong Daqing Liu Xiaoyu Mo Xiangnan He and Hanwang Zhang. 2019. Learning to compose and reason with language tree structures for visual grounding. In TPAMI."},{"key":"e_1_3_2_1_13_1","unstructured":"Ronghang Hu and Amanpreet Singh. 2021. UniT: Multimodal Multitask Learning with a Unified Transformer. In ICCV."},{"key":"e_1_3_2_1_14_1","unstructured":"Ronghang Hu Huazhe Xu Marcus Rohrbach Jiashi Feng Kate Saenko and Trevor Darrell. 2016. Natural language object retrieval. In CVPR."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Aishwarya Kamath Mannat Singh Yann LeCun Gabriel Synnaeve Ishan Misra and Nicolas Carion. 2021. MDETR-modulated detection for end-to-end multi-modal understanding. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Sahar Kazemzadeh Vicente Ordonez Mark Matten and Tamara Berg. 2014. ReferItGame: Referring to Objects in Photographs of Natural Scenes. In EMNLP.","DOI":"10.3115\/v1\/D14-1086"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision Vol. 123 1 (2017) 32--73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_18_1","volume-title":"Referring Transformer: A One-step Approach to Multi-task Visual Grounding. In NeurIPS.","author":"Li Muchen","year":"2021","unstructured":"Muchen Li and Leonid Sigal. 2021. Referring Transformer: A One-step Approach to Multi-task Visual Grounding. In NeurIPS."},{"key":"e_1_3_2_1_19_1","unstructured":"Shuang Li Tong Xiao Hongsheng Li Wei Yang and Xiaogang Wang. 2017. Identity-aware textual-visual matching with latent co-attention. In ICCV."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Yue Liao Si Liu Guanbin Li Fei Wang Yanjie Chen Chen Qian and Bo Li. 2020. A real-time cross-modality correlation filtering method for referring expression comprehension. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"e_1_3_2_1_21_1","unstructured":"Daqing Liu Hanwang Zhang Feng Wu and Zheng-Jun Zha. 2019b. Learning to assemble neural module tree networks for visual grounding. In ICCV."},{"key":"e_1_3_2_1_22_1","unstructured":"Shilong Liu Feng Li Hao Zhang Xiao Yang Xianbiao Qi Hang Su Jun Zhu and Lei Zhang. 2022. DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR. In ICLR."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Xihui Liu Zihao Wang Jing Shao Xiaogang Wang and Hongsheng Li. 2019a. Improving referring expression grounding with cross-modal attention-guided erasing. In CVPR.","DOI":"10.1109\/CVPR.2019.00205"},{"key":"e_1_3_2_1_24_1","volume-title":"Swin Transformer: Hierarchical Vision Transformer Using Shifted Windows. In ICCV.","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo. 2021. Swin Transformer: Hierarchical Vision Transformer Using Shifted Windows. In ICCV."},{"key":"e_1_3_2_1_25_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled weight decay regularization. In ICLR."},{"key":"e_1_3_2_1_26_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In NeurIPS.","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In NeurIPS."},{"key":"e_1_3_2_1_27_1","unstructured":"Jiasen Lu Vedanuj Goswami Marcus Rohrbach Devi Parikh and Stefan Lee. 2020. 12-in-1: Multi-task vision and language representation learning. In CVPR."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Gen Luo Yiyi Zhou Xiaoshuai Sun Liujuan Cao Chenglin Wu Cheng Deng and Rongrong Ji. 2020. Multi-Task Collaborative Network for Joint Referring Expression Comprehension and Segmentation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"e_1_3_2_1_29_1","unstructured":"Junhua Mao Jonathan Huang Alexander Toshev Oana Camburu Alan L Yuille and Kevin Murphy. 2016. Generation and comprehension of unambiguous object descriptions. In CVPR."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"e_1_3_2_1_31_1","volume-title":"Davis","author":"Nagaraja Varun K.","year":"2016","unstructured":"Varun K. Nagaraja, Vlad I. Morariu, and Larry S. Davis. 2016. Modeling Context Between Objects for Referring Expression Understanding. In ECCV."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Bryan A Plummer Paige Kordas M Hadi Kiapour Shuai Zheng Robinson Piramuthu and Svetlana Lazebnik. 2018. Conditional image-text embedding networks. In ECCV.","DOI":"10.1007\/978-3-030-01258-8_16"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Bryan A Plummer Liwei Wang Chris M Cervantes Juan C Caicedo Julia Hockenmaier and Svetlana Lazebnik. 2015. Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In ICCV.","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_34_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_35_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In NeurIPS."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Hamid Rezatofighi Nathan Tsoi JunYoung Gwak Amir Sadeghian Ian Reid and Silvio Savarese. 2019. Generalized intersection over union: A metric and a loss for bounding box regression. In CVPR.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"e_1_3_2_1_37_1","volume-title":"Grounding of Textual Phrases in Images by Reconstruction. ArXiv","author":"Rohrbach Anna","year":"2016","unstructured":"Anna Rohrbach, Marcus Rohrbach, Ronghang Hu, Trevor Darrell, and Bernt Schiele. 2016. Grounding of Textual Phrases in Images by Reconstruction. ArXiv, Vol. abs\/1511.03745 (2016)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Jing Shi Ning Xu Yihang Xu Trung Bui Franck Dernoncourt and Chenliang Xu. 2021. Learning by planning: Language-guided global image editing. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01338"},{"key":"e_1_3_2_1_39_1","volume-title":"LXMERT: Learning Cross-Modality Encoder Representations from Transformers. In EMNLP.","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. In EMNLP."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Stefanie Tellex Nakul Gopalan Hadas Kress-Gazit and Cynthia Matuszek. 2020. Robots that use language. In Annual Review of Control Robotics and Autonomous Systems.","DOI":"10.1146\/annurev-control-101119-071628"},{"key":"e_1_3_2_1_41_1","volume-title":"Theo Gevers, and Arnold WM Smeulders.","author":"Uijlings Jasper RR","year":"2013","unstructured":"Jasper RR Uijlings, Koen EA Van De Sande, Theo Gevers, and Arnold WM Smeulders. 2013. Selective search for object recognition. In IJCV."},{"key":"e_1_3_2_1_42_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. In NeurIPS."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Oriol Vinyals Alexander Toshev Samy Bengio and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. In CVPR.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_44_1","unstructured":"Liwei Wang Yin Li Jing Huang and Svetlana Lazebnik. 2018. Learning two-branch neural networks for image-text matching tasks. In TPAMI."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00928"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Sibei Yang Guanbin Li and Yizhou Yu. 2019b. Dynamic graph attention for referring expression comprehension. In ICCV.","DOI":"10.1109\/ICCV.2019.00474"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Zhengyuan Yang Tianlang Chen Liwei Wang and Jiebo Luo. 2020. Improving One-stage Visual Grounding by Recursive Sub-query Construction. In ECCV.","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Zhengyuan Yang Boqing Gong Liwei Wang Wenbing Huang Dong Yu and Jiebo Luo. 2019a. A fast and accurate one-stage approach to visual grounding. In ICCV.","DOI":"10.1109\/ICCV.2019.00478"},{"key":"e_1_3_2_1_49_1","volume-title":"Ernie-vil: Knowledge enhanced vision-language representations through scene graph. In AAAI.","author":"Yu Fei","year":"2021","unstructured":"Fei Yu, Jiji Tang, Weichong Yin, Yu Sun, Hao Tian, Hua Wu, and Haifeng Wang. 2021. Ernie-vil: Knowledge enhanced vision-language representations through scene graph. In AAAI."},{"key":"e_1_3_2_1_50_1","unstructured":"Licheng Yu Zhe Lin Xiaohui Shen Jimei Yang Xin Lu Mohit Bansal and Tamara L Berg. 2018a. Mattnet: attention network for referring expression comprehension. In CVPR."},{"key":"e_1_3_2_1_51_1","unstructured":"Licheng Yu Patrick Poirson Shan Yang Alexander C Berg and Tamara L Berg. 2016. Modeling context in referring expressions. In ECCV."},{"key":"e_1_3_2_1_52_1","volume-title":"Rethinking Diversified and Discriminative Proposal Generation for Visual Grounding. IJCAI","author":"Yu Zhou","year":"2018","unstructured":"Zhou Yu, Jun Yu, Chenchao Xiang, Zhou Zhao, Qi Tian, and Dacheng Tao. 2018b. Rethinking Diversified and Discriminative Proposal Generation for Visual Grounding. IJCAI (2018)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Luowei Zhou Hamid Palangi Lei Zhang Houdong Hu Jason Corso and Jianfeng Gao. 2020. Unified vision-language pre-training for image captioning and vqa. In AAAI.","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_35"},{"key":"e_1_3_2_1_55_1","unstructured":"Xizhou Zhu Weijie Su Lewei Lu Bin Li Xiaogang Wang and Jifeng Dai. 2021. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In ICLR."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Yukun Zhu Ryan Kiros Rich Zemel Ruslan Salakhutdinov Raquel Urtasun Antonio Torralba and Sanja Fidler. 2015. Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books. In ICCV.","DOI":"10.1109\/ICCV.2015.11"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"C Lawrence Zitnick and Piotr Doll\u00e1r. 2014. Edge boxes: Locating object proposals from edges. In ECCV.","DOI":"10.1007\/978-3-319-10602-1_26"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612584","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612584","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:43Z","timestamp":1755820903000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612584"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":57,"alternative-id":["10.1145\/3581783.3612584","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612584","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}