{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T12:40:53Z","timestamp":1776775253330,"version":"3.51.2"},"reference-count":56,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.neucom.2026.133497","type":"journal-article","created":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T06:49:52Z","timestamp":1774939792000},"page":"133497","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["GTA: Geometric transform-attention network for enhanced spatial reasoning in image captioning"],"prefix":"10.1016","volume":"683","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-9339-4393","authenticated-orcid":false,"given":"Mohammad Alamgir","family":"Hossain","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhongfu","family":"Ye","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8546-5426","authenticated-orcid":false,"given":"Md. Bipul","family":"Hossen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2854-4419","authenticated-orcid":false,"given":"Md Shohidul","family":"Islam","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Md. Ibrahim","family":"Abdullah","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2026.133497_bib0005","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2024.104165","article-title":"Triple-stream commonsense circulation transformer network for image captioning","volume":"249","author":"Li","year":"2024","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.neucom.2026.133497_bib0010","doi-asserted-by":"crossref","DOI":"10.1016\/j.displa.2024.102798","article-title":"Iceap: an advanced fine-grained image captioning network with enhanced attribute predictor","volume":"84","author":"Hossen","year":"2024","journal-title":"Displays"},{"key":"10.1016\/j.neucom.2026.133497_bib0015","doi-asserted-by":"crossref","first-page":"2851","DOI":"10.1109\/TMM.2022.3152086","article-title":"Self-supervised correlation learning for cross-modal retrieval","volume":"25","author":"Liu","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.neucom.2026.133497_bib0020","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110555","article-title":"Cast: cross-modal retrieval and visual conditioning for image captioning","volume":"153","author":"Cao","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.neucom.2026.133497_bib0025","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3156","article-title":"Show and tell: a neural image caption generator","author":"Vinyals","year":"2015"},{"key":"10.1016\/j.neucom.2026.133497_bib0030","series-title":"International Conference on Machine Learning","first-page":"2048","article-title":"Show, attend and tell: neural image caption generation with visual attention","author":"Xu","year":"2015"},{"key":"10.1016\/j.neucom.2026.133497_bib0035","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109420","article-title":"Towards local visual modeling for image captioning","volume":"138","author":"Ma","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.neucom.2026.133497_bib0040","doi-asserted-by":"crossref","first-page":"3962","DOI":"10.1109\/TMM.2022.3169061","article-title":"Multi-branch distance-sensitive self-attention network for image captioning","volume":"25","author":"Ji","year":"2023","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.neucom.2026.133497_bib0045","series-title":"Advances in Neural Information Processing Systems","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.neucom.2026.133497_bib0050","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10971","article-title":"X-linear attention networks for image captioning","author":"Pan","year":"2020"},{"key":"10.1016\/j.neucom.2026.133497_bib0055","author":"Chen"},{"key":"10.1016\/j.neucom.2026.133497_bib0060","author":"Yang"},{"key":"10.1016\/j.neucom.2026.133497_bib0065","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Meacap: memory-augmented zero-shot image captioning","author":"Zeng","year":"2024"},{"key":"10.1016\/j.neucom.2026.133497_bib0070","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Evcap: retrieval-augmented image captioning with external visual-name memory for open-world comprehension","author":"Li","year":"2024"},{"key":"10.1016\/j.neucom.2026.133497_bib0075","series-title":"European Conference on Computer Vision (ECCV)","article-title":"Unleashing text-to-image diffusion prior for zero-shot image captioning","author":"Luo","year":"2024"},{"key":"10.1016\/j.neucom.2026.133497_bib0080","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2022.117174","article-title":"Geometry attention transformer with position-aware lstms for image captioning","volume":"201","author":"Wang","year":"2022","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.neucom.2026.133497_bib0085","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.126692","article-title":"Geoscn: a novel multimodal self-attention to integrate geometric information on spatial-channel network for fine-grained image captioning","volume":"272","author":"Hossain","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.neucom.2026.133497_bib0090","series-title":"Computer Vision \u2013 ECCV 2010","first-page":"15","article-title":"Every picture tells a story: generating sentences from images","author":"Farhadi","year":"2010"},{"key":"10.1016\/j.neucom.2026.133497_bib0095","series-title":"2011 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Baby talk: understanding and generating simple image descriptions","author":"Kulkarni","year":"2011"},{"key":"10.1016\/j.neucom.2026.133497_bib0100","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018"},{"key":"10.1016\/j.neucom.2026.133497_bib0105","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3588","article-title":"Relation networks for object detection","author":"Hu","year":"2018"},{"key":"10.1016\/j.neucom.2026.133497_bib0110","series-title":"Proceedings of the 39th International Conference on Machine Learning (ICML)","first-page":"12888","article-title":"BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.neucom.2026.133497_bib0115","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"1179","article-title":"Self-critical sequence training for image captioning","author":"Rennie","year":"2017"},{"key":"10.1016\/j.neucom.2026.133497_bib0120","doi-asserted-by":"crossref","DOI":"10.1016\/j.dsp.2025.105435","article-title":"Gava: spatial awareness in image captioning with geometric-aware visual attention","volume":"167","author":"Hossain","year":"2025","journal-title":"Digit. Signal Process."},{"key":"10.1016\/j.neucom.2026.133497_bib0125","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2026.114014","article-title":"Hierarchical region-context attention for image captioning","volume":"168","author":"Hossain","year":"2026","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.neucom.2026.133497_bib0130","series-title":"Advances in Neural Information Processing Systems","first-page":"91","article-title":"Faster r-Cnn: towards real-time object detection with region proposal networks","author":"Ren","year":"2015"},{"key":"10.1016\/j.neucom.2026.133497_bib0135","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"684","article-title":"Exploring visual relationship for image captioning","author":"Yao","year":"2018"},{"key":"10.1016\/j.neucom.2026.133497_bib0140","series-title":"Proceedings of the 34th International Conference on Machine Learning","first-page":"933","article-title":"Language modeling with gated convolutional networks","author":"Dauphin","year":"2017"},{"key":"10.1016\/j.neucom.2026.133497_bib0145","series-title":"Deep Learning","author":"Goodfellow","year":"2016"},{"key":"10.1016\/j.neucom.2026.133497_bib0150","series-title":"2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4566","article-title":"Cider: consensus-based image description evaluation","author":"Vedantam","year":"2015"},{"key":"10.1016\/j.neucom.2026.133497_bib0155","series-title":"Computer Vision \u2013 ECCV 2014","first-page":"740","article-title":"Microsoft COCO: common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.neucom.2026.133497_bib0160","series-title":"Proceedings of ACL 2002","article-title":"BLEU: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.neucom.2026.133497_bib0165","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3128","article-title":"Deep visual-semantic alignments for generating image descriptions","author":"Karpathy","year":"2015"},{"key":"10.1016\/j.neucom.2026.133497_bib0170","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","article-title":"Visual genome: connecting language and vision using crowdsourced dense image annotations","volume":"123","author":"Krishna","year":"2017","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.neucom.2026.133497_bib0175","series-title":"Advances in Neural Information Processing Systems (NeurIPS)","first-page":"8024","article-title":"Pytorch: an imperative style, high-performance deep learning library","author":"Paszke","year":"2019"},{"key":"10.1016\/j.neucom.2026.133497_bib0180","author":"Kingma"},{"key":"10.1016\/j.neucom.2026.133497_bib0185","series-title":"Advances in Neural Information Processing Systems (NeurIPS)","article-title":"Scheduled sampling for sequence prediction with recurrent neural networks","author":"Bengio","year":"2015"},{"key":"10.1016\/j.neucom.2026.133497_bib0190","series-title":"Proceedings of the Ninth Workshop on Statistical Machine Translation","first-page":"376","article-title":"Meteor Universal: language specific translation evaluation for any target language","author":"Denkowski","year":"2014"},{"key":"10.1016\/j.neucom.2026.133497_bib0195","series-title":"Proceedings of the ACL Workshop on Text Summarization Branches Out","first-page":"74","article-title":"ROUGE: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.neucom.2026.133497_bib0200","series-title":"Computer Vision \u2013 ECCV 2016","article-title":"Spice: semantic propositional image caption evaluation","author":"Anderson","year":"2016"},{"key":"10.1016\/j.neucom.2026.133497_bib0205","series-title":"2019 IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"4634","article-title":"Attention on attention for image captioning","author":"Huang","year":"2019"},{"key":"10.1016\/j.neucom.2026.133497_bib0210","series-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Meshed-memory transformer for image captioning","author":"Cornia","year":"2020"},{"key":"10.1016\/j.neucom.2026.133497_bib0215","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"13041","article-title":"Unified vision-language pre-training for image captioning and VQA","author":"Zhou","year":"2020"},{"key":"10.1016\/j.neucom.2026.133497_bib0220","doi-asserted-by":"crossref","first-page":"18413","DOI":"10.1007\/s11042-021-10578-9","article-title":"Mrrc: multiple role representation crossover interpretation for image captioning","volume":"80","author":"Sur","year":"2021","journal-title":"Multim. Tools Appl."},{"key":"10.1016\/j.neucom.2026.133497_bib0225","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2022.105194","article-title":"Dynamic-balanced double-attention fusion for image captioning","volume":"114","author":"Wang","year":"2022","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.neucom.2026.133497_bib0230","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"23359","article-title":"Semantic-conditional diffusion networks for image captioning","author":"Luo","year":"2023"},{"key":"10.1016\/j.neucom.2026.133497_bib0235","doi-asserted-by":"crossref","first-page":"812","DOI":"10.1016\/j.ins.2022.12.018","article-title":"Label-attention transformer with geometrically coherent objects for image captioning","volume":"623","author":"Dubey","year":"2023","journal-title":"Inf. Sci."},{"key":"10.1016\/j.neucom.2026.133497_bib0240","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2024.109134","article-title":"Attribute-driven filtering: a new attributes predicting approach for fine-grained image captioning","volume":"137","author":"Hossen","year":"2024","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.neucom.2026.133497_bib0245","article-title":"From multi-scale grids to dynamic regions: dual-relation enhanced transformer for image captioning","author":"Zhou","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.neucom.2026.133497_bib0250","article-title":"From grids to pseudo-regions: dynamic memory augmented image captioning with dual relation transformer","author":"Zhou","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.neucom.2026.133497_bib0255","article-title":"M3ixup: a multi-modal data augmentation approach for image captioning","author":"Li","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.neucom.2026.133497_bib0260","doi-asserted-by":"crossref","DOI":"10.1016\/j.dsp.2025.105155","article-title":"Arafnet: an attribute refinement attention fusion network for advanced visual captioning","volume":"162","author":"Hossen","year":"2025","journal-title":"Digit. Signal Process."},{"key":"10.1016\/j.neucom.2026.133497_bib0265","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10685","article-title":"Auto-encoding scene graphs for image captioning","author":"Yang","year":"2019"},{"key":"10.1016\/j.neucom.2026.133497_bib0270","doi-asserted-by":"crossref","first-page":"6575","DOI":"10.1007\/s10489-021-02734-3","article-title":"Image captioning with adaptive incremental global context attention","volume":"52","author":"Wang","year":"2022","journal-title":"Appl. Intell."},{"key":"10.1016\/j.neucom.2026.133497_bib0275","doi-asserted-by":"crossref","first-page":"2088","DOI":"10.1109\/TPAMI.2022.3159811","article-title":"On distinctive image captioning via comparing and reweighting","volume":"45","author":"Wang","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.133497_bib0280","author":"Wang"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226008945?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226008945?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T11:46:19Z","timestamp":1776771979000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231226008945"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":56,"alternative-id":["S0925231226008945"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133497","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"GTA: Geometric transform-attention network for enhanced spatial reasoning in image captioning","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133497","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"133497"}}