{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T17:06:13Z","timestamp":1780074373439,"version":"3.54.0"},"reference-count":55,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012165","name":"Key Technologies Research and Development Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012165","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.114075","type":"journal-article","created":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T14:57:59Z","timestamp":1779980279000},"page":"114075","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PB","title":["A fine-grained entity understanding network for weakly supervised phrase grounding"],"prefix":"10.1016","volume":"180","author":[{"given":"Pengyue","family":"Lin","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3543-6272","authenticated-orcid":false,"given":"Ruifan","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fangxiang","family":"Feng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lun","family":"Ke","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhanyu","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaojie","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.114075_b1","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103625","article-title":"Visual grounding in 2D and 3D: A unified perspective and survey","volume":"126","author":"Guo","year":"2026","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.patcog.2026.114075_b2","doi-asserted-by":"crossref","unstructured":"P. Dogan, L. Sigal, M. Gross, Neural sequential phrase grounding (seqground), in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 4175\u20134184.","DOI":"10.1109\/CVPR.2019.00430"},{"key":"10.1016\/j.patcog.2026.114075_b3","first-page":"13587","article-title":"Disentangled motif-aware graph learning for phrase grounding","volume":"vol. 35","author":"Mu","year":"2021"},{"key":"10.1016\/j.patcog.2026.114075_b4","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111222","article-title":"Graph-based referring expression comprehension with expression-guided selective filtering and noun-oriented reasoning","volume":"161","author":"Ke","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114075_b5","doi-asserted-by":"crossref","unstructured":"Z. Yang, B. Gong, L. Wang, W. Huang, D. Yu, J. Luo, A fast and accurate one-stage approach to visual grounding, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 4683\u20134693.","DOI":"10.1109\/ICCV.2019.00478"},{"key":"10.1016\/j.patcog.2026.114075_b6","doi-asserted-by":"crossref","first-page":"4266","DOI":"10.1109\/TIP.2022.3181516","article-title":"Progressive language-customized visual feature learning for one-stage visual grounding","volume":"31","author":"Liao","year":"2022","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2026.114075_b7","first-page":"1728","article-title":"DQ-DETR: Dual query detection transformer for phrase extraction and grounding","volume":"vol. 37","author":"Liu","year":"2023"},{"key":"10.1016\/j.patcog.2026.114075_b8","article-title":"DCART: A dual contrastive alignment residual transformer model for visual grounding","author":"Zhu","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114075_b9","first-page":"36067","article-title":"GLIPv2: Unifying localization and vision-language understanding","volume":"35","author":"Zhang","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114075_b10","article-title":"Phrase grounding-based style transfer for single-domain generalized object detection","author":"Li","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.114075_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111663","article-title":"Language\u2013Image consistency augmentation and distillation network for visual grounding","volume":"166","author":"Ke","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114075_b12","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114075_b13","doi-asserted-by":"crossref","unstructured":"H. Rasheed, M. Maaz, S. Shaji, A. Shaker, S. Khan, H. Cholakkal, R.M. Anwer, E. Xing, M.-H. Yang, F.S. Khan, Glamm: Pixel grounding large multimodal model, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 13009\u201313018.","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"10.1016\/j.patcog.2026.114075_b14","series-title":"Qwen2.5-VL","author":"Team","year":"2025"},{"key":"10.1016\/j.patcog.2026.114075_b15","series-title":"DeepSeek-VL2: Mixture-of-experts vision-language models for advanced multimodal understanding","author":"Wu","year":"2024"},{"key":"10.1016\/j.patcog.2026.114075_b16","doi-asserted-by":"crossref","unstructured":"Y. Liu, B. Wan, L. Ma, X. He, Relation-aware instance refinement for weakly supervised visual grounding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 5612\u20135621.","DOI":"10.1109\/CVPR46437.2021.00556"},{"key":"10.1016\/j.patcog.2026.114075_b17","doi-asserted-by":"crossref","unstructured":"K. Chen, R. Zhang, S. Mensah, Y. Mao, Contrastive learning with expectation-maximization for weakly supervised phrase grounding, in: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, 2022, pp. 8549\u20138559.","DOI":"10.18653\/v1\/2022.emnlp-main.586"},{"key":"10.1016\/j.patcog.2026.114075_b18","first-page":"24348","article-title":"Momentum pseudo-labeling for weakly supervised phrase grounding","volume":"vol. 39","author":"Kuang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114075_b19","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12476","article-title":"Multi-level multimodal common semantic space for image-phrase grounding","author":"Akbari","year":"2019"},{"key":"10.1016\/j.patcog.2026.114075_b20","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"1801","article-title":"Detector-free weakly supervised grounding by separation","author":"Arbelle","year":"2021"},{"key":"10.1016\/j.patcog.2026.114075_b21","doi-asserted-by":"crossref","unstructured":"T. Shaharabany, L. Wolf, Similarity Maps for Self-Training Weakly-Supervised Phrase Grounding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 6925\u20136934.","DOI":"10.1109\/CVPR52729.2023.00669"},{"key":"10.1016\/j.patcog.2026.114075_b22","series-title":"ACM Multimedia 2024","article-title":"Triple alignment strategies for zero-shot phrase grounding under weak supervision","author":"Lin","year":"2024"},{"key":"10.1016\/j.patcog.2026.114075_b23","doi-asserted-by":"crossref","unstructured":"Y. Zeng, Y. Huang, J. Zhang, Z. Jie, Z. Chai, L. Wang, Investigating Compositional Challenges in Vision-Language Models for Visual Grounding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 14141\u201314151.","DOI":"10.1109\/CVPR52733.2024.01341"},{"key":"10.1016\/j.patcog.2026.114075_b24","doi-asserted-by":"crossref","unstructured":"M. Li, C. Wang, W. Feng, S. Lyu, G. Cheng, X. Li, B. Liu, Q. Zhao, Iterative robust visual grounding with masked reference based centerpoint supervision, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 4651\u20134656.","DOI":"10.1109\/ICCVW60793.2023.00501"},{"issue":"1","key":"10.1016\/j.patcog.2026.114075_b25","doi-asserted-by":"crossref","first-page":"75","DOI":"10.1109\/TCSVT.2024.3452418","article-title":"A masked reference token supervision-based iterative visual-language framework for robust visual grounding","volume":"35","author":"Wang","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.114075_b26","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2026.113555","article-title":"PLRVG: Progressive layer-wise refinement for visual grounding via deep-to-shallow decoding","volume":"179","author":"Cheng","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114075_b27","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.112484","article-title":"Learning unified patterns of multimodalities for video temporal grounding","volume":"172","author":"Yang","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114075_b28","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.112926","article-title":"LSVG: Language-guided scene graphs with 2D-assisted multi-modal encoding for 3D visual grounding","volume":"174","author":"Xiao","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114075_b29","series-title":"Adapting clip for phrase localization without further training","author":"Li","year":"2022"},{"key":"10.1016\/j.patcog.2026.114075_b30","series-title":"European Conference on Computer Vision","first-page":"696","article-title":"Extract free dense labels from clip","author":"Zhou","year":"2022"},{"key":"10.1016\/j.patcog.2026.114075_b31","doi-asserted-by":"crossref","unstructured":"J. Lu, V. Goswami, M. Rohrbach, D. Parikh, S. Lee, 12-in-1: Multi-task vision and language representation learning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 10437\u201310446.","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"10.1016\/j.patcog.2026.114075_b32","doi-asserted-by":"crossref","unstructured":"L. Wang, J. Huang, Y. Li, K. Xu, Z. Yang, D. Yu, Improving weakly supervised visual grounding by contrastive knowledge distillation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 14090\u201314100.","DOI":"10.1109\/CVPR46437.2021.01387"},{"key":"10.1016\/j.patcog.2026.114075_b33","series-title":"European Conference on Computer Vision","first-page":"752","article-title":"Contrastive learning for weakly supervised phrase grounding","author":"Gupta","year":"2020"},{"key":"10.1016\/j.patcog.2026.114075_b34","doi-asserted-by":"crossref","unstructured":"K. Chen, J. Gao, R. Nevatia, Knowledge aided consistency for weakly supervised phrase grounding, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 4042\u20134050.","DOI":"10.1109\/CVPR.2018.00425"},{"key":"10.1016\/j.patcog.2026.114075_b35","doi-asserted-by":"crossref","unstructured":"S. Datta, K. Sikka, A. Roy, K. Ahuja, D. Parikh, A. Divakaran, Align2ground: Weakly supervised phrase grounding guided by image-caption alignment, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 2601\u20132610.","DOI":"10.1109\/ICCV.2019.00269"},{"key":"10.1016\/j.patcog.2026.114075_b36","doi-asserted-by":"crossref","unstructured":"H. Fang, S. Gupta, F. Iandola, R.K. Srivastava, L. Deng, P. Doll\u00e1r, J. Gao, X. He, M. Mitchell, J.C. Platt, et al., From captions to visual concepts and back, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2015, pp. 1473\u20131482.","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"10.1016\/j.patcog.2026.114075_b37","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5945","article-title":"Weakly-supervised visual grounding of phrases with linguistic structures","author":"Xiao","year":"2017"},{"key":"10.1016\/j.patcog.2026.114075_b38","doi-asserted-by":"crossref","unstructured":"S.A. Javed, S. Saxena, V. Gandhi, Learning unsupervised visual grounding through semantic self-supervision, in: Proceedings of the 28th International Joint Conference on Artificial Intelligence, 2019, pp. 796\u2013802.","DOI":"10.24963\/ijcai.2019\/112"},{"issue":"10","key":"10.1016\/j.patcog.2026.114075_b39","doi-asserted-by":"crossref","first-page":"1084","DOI":"10.1007\/s11263-017-1059-x","article-title":"Top-down neural attention by excitation backprop","volume":"126","author":"Zhang","year":"2018","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.patcog.2026.114075_b40","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.114075_b41","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114075_b42","unstructured":"R. He, Z. Yang, P. Cascante-Bonilla, A.C. Berg, V. Ordonez, Learning from Synthetic Data for Visual Grounding, in: Synthetic Data for Computer Vision Workshop@ CVPR 2025, 2025."},{"key":"10.1016\/j.patcog.2026.114075_b43","series-title":"Barking up the syntactic tree: Enhancing VLM training with syntactic losses","author":"Luo","year":"2025"},{"key":"10.1016\/j.patcog.2026.114075_b44","doi-asserted-by":"crossref","unstructured":"R. He, P. Cascante-Bonilla, Z. Yang, A.C. Berg, V. Ordonez, Improved Visual Grounding through Self-Consistent Explanations, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 13095\u201313105.","DOI":"10.1109\/CVPR52733.2024.01244"},{"key":"10.1016\/j.patcog.2026.114075_b45","series-title":"2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7895","article-title":"Visual prompt tuning for weakly supervised phrase grounding","author":"Lin","year":"2024"},{"key":"10.1016\/j.patcog.2026.114075_b46","doi-asserted-by":"crossref","first-page":"28222","DOI":"10.52202\/068431-2046","article-title":"What is where by looking: Weakly-supervised open-world phrase-grounding without text inputs","volume":"35","author":"Shaharabany","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114075_b47","doi-asserted-by":"crossref","unstructured":"E. Gomel, T. Shaharbany, L. Wolf, Box-based Refinement for Weakly Supervised and Unsupervised Localization Tasks, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 16044\u201316054.","DOI":"10.1109\/ICCV51070.2023.01470"},{"key":"10.1016\/j.patcog.2026.114075_b48","series-title":"European Conference on Computer Vision","first-page":"315","article-title":"Sclip: Rethinking self-attention for dense vision-language inference","author":"Wang","year":"2024"},{"key":"10.1016\/j.patcog.2026.114075_b49","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","article-title":"Visual genome: Connecting language and vision using crowdsourced dense image annotations","volume":"123","author":"Krishna","year":"2017","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.patcog.2026.114075_b50","series-title":"European Conference on Computer Vision","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.patcog.2026.114075_b51","series-title":"Representation learning with contrastive predictive coding","author":"Oord","year":"2018"},{"key":"10.1016\/j.patcog.2026.114075_b52","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"16044","article-title":"Box-based refinement for weakly supervised and unsupervised localization tasks","author":"Gomel","year":"2023"},{"key":"10.1016\/j.patcog.2026.114075_b53","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"2641","article-title":"Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models","author":"Plummer","year":"2015"},{"key":"10.1016\/j.patcog.2026.114075_b54","doi-asserted-by":"crossref","unstructured":"S. Kazemzadeh, V. Ordonez, M. Matten, T. Berg, Referitgame: Referring to objects in photographs of natural scenes, in: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, EMNLP, 2014, pp. 787\u2013798.","DOI":"10.3115\/v1\/D14-1086"},{"key":"10.1016\/j.patcog.2026.114075_b55","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S003132032601040X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S003132032601040X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T16:59:20Z","timestamp":1780073960000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S003132032601040X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":55,"alternative-id":["S003132032601040X"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114075","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A fine-grained entity understanding network for weakly supervised phrase grounding","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114075","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114075"}}