{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T12:03:21Z","timestamp":1781006601266,"version":"3.54.1"},"reference-count":58,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.knosys.2026.116102","type":"journal-article","created":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T13:07:25Z","timestamp":1778332045000},"page":"116102","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Improving visual grounding with expression-relevant object refinement and multi-order iterative reasoning"],"prefix":"10.1016","volume":"345","author":[{"given":"Qun","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Feng","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiang","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ruida","family":"Ye","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianyu","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pengfei","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.116102_b1","doi-asserted-by":"crossref","unstructured":"J. Deng, Z. Yang, T. Chen, W. Zhou, H. Li, TransVG: End-to-End Visual Grounding with Transformers, in: 2021 IEEE\/CVF International Conference on Computer Vision, ICCV, 2021, pp. 1749\u20131759, http:\/\/dx.doi.org\/10.1109\/ICCV48922.2021.00179.","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"10.1016\/j.knosys.2026.116102_b2","series-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9489","article-title":"Improving visual grounding with visual-linguistic verification and iterative reasoning","author":"Yang","year":"2022"},{"key":"10.1016\/j.knosys.2026.116102_b3","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"11","key":"10.1016\/j.knosys.2026.116102_b4","doi-asserted-by":"crossref","first-page":"13636","DOI":"10.1109\/TPAMI.2023.3296823","article-title":"TransVG++: End-to-end visual grounding with language conditioned vision transformer","volume":"45","author":"Deng","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116102_b5","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10857","article-title":"Language adaptive weight generation for multi-task visual grounding","author":"Su","year":"2023"},{"key":"10.1016\/j.knosys.2026.116102_b6","doi-asserted-by":"crossref","unstructured":"X. Liu, Z. Wang, J. Shao, X. Wang, H. Li, Improving referring expression grounding with cross-modal attention-guided erasing, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 1950\u20131959.","DOI":"10.1109\/CVPR.2019.00205"},{"issue":"2","key":"10.1016\/j.knosys.2026.116102_b7","doi-asserted-by":"crossref","first-page":"684","DOI":"10.1109\/TPAMI.2019.2911066","article-title":"Learning to compose and reason with language tree structures for visual grounding","volume":"44","author":"Hong","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116102_b8","series-title":"2022 IEEE International Conference on Multimedia and Expo","first-page":"1","article-title":"LPGN: Language-guided proposal generation network for referring expression comprehension","author":"Wang","year":"2022"},{"key":"10.1016\/j.knosys.2026.116102_b9","doi-asserted-by":"crossref","unstructured":"R. Girshick, Fast r-cnn, in: Proceedings of the IEEE International Conference on Computer Vision, 2015, pp. 1440\u20131448.","DOI":"10.1109\/ICCV.2015.169"},{"key":"10.1016\/j.knosys.2026.116102_b10","doi-asserted-by":"crossref","unstructured":"K. He, G. Gkioxari, P. Doll\u00e1r, R. Girshick, Mask r-cnn, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 2961\u20132969.","DOI":"10.1109\/ICCV.2017.322"},{"key":"10.1016\/j.knosys.2026.116102_b11","series-title":"2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"1307","article-title":"MAttNet: Modular attention network for referring expression comprehension","author":"Yu","year":"2018"},{"key":"10.1016\/j.knosys.2026.116102_b12","doi-asserted-by":"crossref","unstructured":"J. Andreas, M. Rohrbach, T. Darrell, D. Klein, Neural module networks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 39\u201348.","DOI":"10.1109\/CVPR.2016.12"},{"key":"10.1016\/j.knosys.2026.116102_b13","series-title":"2019 IEEE\/CVF International Conference on Computer Vision","first-page":"4643","article-title":"Dynamic graph attention for referring expression comprehension","author":"Yang","year":"2019"},{"key":"10.1016\/j.knosys.2026.116102_b14","doi-asserted-by":"crossref","unstructured":"P. Wang, Q. Wu, J. Cao, C. Shen, L. Gao, A.v.d. Hengel, Neighbourhood watch: Referring expression comprehension via language-guided graph attention networks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 1960\u20131968.","DOI":"10.1109\/CVPR.2019.00206"},{"key":"10.1016\/j.knosys.2026.116102_b15","doi-asserted-by":"crossref","unstructured":"S. Yang, G. Li, Y. Yu, Graph-structured referring expression reasoning in the wild, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 9952\u20139961.","DOI":"10.1109\/CVPR42600.2020.00997"},{"issue":"8","key":"10.1016\/j.knosys.2026.116102_b16","doi-asserted-by":"crossref","first-page":"2765","DOI":"10.1109\/TPAMI.2020.2973983","article-title":"Relationship-embedded representation learning for grounding referring expressions","volume":"43","author":"Yang","year":"2021","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116102_b17","doi-asserted-by":"crossref","unstructured":"L. Chen, W. Ma, J. Xiao, H. Zhang, S.-F. Chang, Ref-nms: Breaking proposal bottlenecks in two-stage referring expression grounding, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 35, 2021, pp. 1036\u20131044.","DOI":"10.1609\/aaai.v35i2.16188"},{"issue":"5s","key":"10.1016\/j.knosys.2026.116102_b18","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3579095","article-title":"Vl-nms: Breaking proposal bottlenecks in two-stage visual-language matching","volume":"19","author":"Zhang","year":"2023","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.knosys.2026.116102_b19","series-title":"Real-time referring expression comprehension by single-stage grounding network","author":"Chen","year":"2018"},{"key":"10.1016\/j.knosys.2026.116102_b20","doi-asserted-by":"crossref","unstructured":"Y. Liao, S. Liu, G. Li, F. Wang, Y. Chen, C. Qian, B. Li, A real-time cross-modality correlation filtering method for referring expression comprehension, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 10880\u201310889.","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"10.1016\/j.knosys.2026.116102_b21","article-title":"A real-time global inference network for one-stage referring expression comprehension","author":"Zhou","year":"2021","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.knosys.2026.116102_b22","doi-asserted-by":"crossref","unstructured":"Z. Yang, B. Gong, L. Wang, W. Huang, D. Yu, J. Luo, A fast and accurate one-stage approach to visual grounding, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 4683\u20134693.","DOI":"10.1109\/ICCV.2019.00478"},{"key":"10.1016\/j.knosys.2026.116102_b23","series-title":"Yolov3: An incremental improvement","author":"Redmon","year":"2018"},{"key":"10.1016\/j.knosys.2026.116102_b24","series-title":"European Conference on Computer Vision","first-page":"387","article-title":"Improving one-stage visual grounding by recursive sub-query construction","author":"Yang","year":"2020"},{"key":"10.1016\/j.knosys.2026.116102_b25","article-title":"Xlnet: Generalized autoregressive pretraining for language understanding","volume":"32","author":"Yang","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116102_b26","series-title":"Roberta: A robustly optimized bert pretraining approach","author":"Liu","year":"2019"},{"key":"10.1016\/j.knosys.2026.116102_b27","article-title":"Sampling based spherical transformer for 360 degree image classification","author":"Cho","year":"2023","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.116102_b28","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.119997","article-title":"ETAM: Ensemble transformer with attention modules for detection of small objects","volume":"224","author":"Zhang","year":"2023","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.116102_b29","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2022.118665","article-title":"Tiny object detection with context enhancement and feature purification","volume":"211","author":"Xiao","year":"2023","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.116102_b30","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.120539","article-title":"HA-transformer: Harmonious aggregation from local to global for object detection","author":"Chen","year":"2023","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.116102_b31","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.knosys.2026.116102_b32","series-title":"European Conference on Computer Vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.knosys.2026.116102_b33","doi-asserted-by":"crossref","unstructured":"A. Kamath, M. Singh, Y. LeCun, G. Synnaeve, I. Misra, N. Carion, Mdetr-modulated detection for end-to-end multi-modal understanding, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 1780\u20131790.","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"10.1016\/j.knosys.2026.116102_b34","series-title":"Referring expression comprehension via cross-level multi-modal fusion","author":"Miao","year":"2022"},{"key":"10.1016\/j.knosys.2026.116102_b35","series-title":"European Conference on Computer Vision","first-page":"3","article-title":"Yoro-lightweight end to end visual grounding","author":"Ho","year":"2022"},{"key":"10.1016\/j.knosys.2026.116102_b36","doi-asserted-by":"crossref","first-page":"854","DOI":"10.1109\/TIP.2022.3227466","article-title":"Rethinking and improving feature pyramids for one-stage referring expression comprehension","volume":"32","author":"Suo","year":"2023","journal-title":"IEEE Trans. Image Process."},{"issue":"6","key":"10.1016\/j.knosys.2026.116102_b37","first-page":"1","article-title":"Transformer-based visual grounding with cross-modality interaction","volume":"19","author":"Li","year":"2023","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.knosys.2026.116102_b38","doi-asserted-by":"crossref","first-page":"4334","DOI":"10.1109\/TMM.2023.3321501","article-title":"CLIP-VG: Self-paced curriculum adapting of CLIP for visual grounding","volume":"26","author":"Xiao","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.knosys.2026.116102_b39","series-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"10.1016\/j.knosys.2026.116102_b40","doi-asserted-by":"crossref","unstructured":"J. Ye, J. Tian, M. Yan, X. Yang, X. Wang, J. Zhang, L. He, X. Lin, Shifting more attention to visual backbone: Query-modulated refinement networks for end-to-end visual grounding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 15502\u201315512.","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"10.1016\/j.knosys.2026.116102_b41","series-title":"2021 IEEE\/CVF International Conference on Computer Vision","first-page":"9992","article-title":"Swin transformer: Hierarchical vision transformer using shifted windows","author":"Liu","year":"2021"},{"key":"10.1016\/j.knosys.2026.116102_b42","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.knosys.2026.116102_b43","doi-asserted-by":"crossref","unstructured":"H. Rezatofighi, N. Tsoi, J. Gwak, A. Sadeghian, I. Reid, S. Savarese, Generalized intersection over union: A metric and a loss for bounding box regression, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 658\u2013666.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"10.1016\/j.knosys.2026.116102_b44","series-title":"European Conference on Computer Vision","first-page":"69","article-title":"Modeling context in referring expressions","author":"Yu","year":"2016"},{"key":"10.1016\/j.knosys.2026.116102_b45","doi-asserted-by":"crossref","unstructured":"J. Mao, J. Huang, A. Toshev, O. Camburu, A.L. Yuille, K. Murphy, Generation and comprehension of unambiguous object descriptions, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 11\u201320.","DOI":"10.1109\/CVPR.2016.9"},{"key":"10.1016\/j.knosys.2026.116102_b46","doi-asserted-by":"crossref","unstructured":"B. Huang, D. Lian, W. Luo, S. Gao, Look before you leap: Learning landmark features for one-stage visual grounding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 16888\u201316897.","DOI":"10.1109\/CVPR46437.2021.01661"},{"key":"10.1016\/j.knosys.2026.116102_b47","doi-asserted-by":"crossref","first-page":"4266","DOI":"10.1109\/TIP.2022.3181516","article-title":"Progressive language-customized visual feature learning for one-stage visual grounding","volume":"31","author":"Liao","year":"2022","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.116102_b48","series-title":"European Conference on Computer Vision","first-page":"598","article-title":"Seqtr: A simple yet universal network for visual grounding","author":"Zhu","year":"2022"},{"key":"10.1016\/j.knosys.2026.116102_b49","doi-asserted-by":"crossref","unstructured":"T. Liu, X. Liu, S. Huang, H. Chen, Q. Yin, L. Qin, D. Wang, Y. Hu, DARA: Domain- and Relation-Aware Adapters Make Parameter-Efficient Tuning for Visual Grounding, in: 2024 IEEE International Conference on Multimedia and Expo, ICME, 2024, pp. 1\u20136.","DOI":"10.1109\/ICME57554.2024.10688132"},{"key":"10.1016\/j.knosys.2026.116102_b50","doi-asserted-by":"crossref","unstructured":"T. Liu, Z. Xu, Y. Hu, L. Shi, Z. Wang, Q. Yin, MaPPER: Multimodal Prior-guided Parameter Efficient Tuning for Referring Expression Comprehension, in: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, 2024, pp. 4984\u20134994.","DOI":"10.18653\/v1\/2024.emnlp-main.287"},{"key":"10.1016\/j.knosys.2026.116102_b51","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123223","article-title":"Improving visual grounding with multi-scale discrepancy information and centralized-transformer","volume":"247","author":"Wu","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.116102_b52","doi-asserted-by":"crossref","unstructured":"Z. Cheng, K. Li, P. Jin, S. Li, X. Ji, L. Yuan, C. Liu, J. Chen, Parallel vertex diffusion for unified visual grounding, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38, 2024, pp. 1326\u20131334.","DOI":"10.1609\/aaai.v38i2.27896"},{"key":"10.1016\/j.knosys.2026.116102_b53","doi-asserted-by":"crossref","unstructured":"R. Yao, S. Xiong, Y. Zhao, Y. Rong, Visual Grounding with Multi-modal Conditional Adaptation, in: Proceedings of the 32nd ACM International Conference on Multimedia, 2024, pp. 3877\u20133886.","DOI":"10.1145\/3664647.3681256"},{"key":"10.1016\/j.knosys.2026.116102_b54","article-title":"LGR-NET: Language guided reasoning network for referring expression comprehension","author":"Lu","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116102_b55","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, the Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14","first-page":"792","article-title":"Modeling context between objects for referring expression understanding","author":"Nagaraja","year":"2016"},{"key":"10.1016\/j.knosys.2026.116102_b56","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.knosys.2026.116102_b57","series-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017"},{"key":"10.1016\/j.knosys.2026.116102_b58","series-title":"European Conference on Computer Vision","first-page":"280","article-title":"Exploring plain vision transformer backbones for object detection","author":"Li","year":"2022"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008282?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008282?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T11:29:29Z","timestamp":1781004569000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126008282"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":58,"alternative-id":["S0950705126008282"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116102","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Improving visual grounding with expression-relevant object refinement and multi-order iterative reasoning","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116102","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116102"}}