{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T16:52:05Z","timestamp":1781283125363,"version":"3.54.1"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2021,11,27]],"date-time":"2021-11-27T00:00:00Z","timestamp":1637971200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,11,27]],"date-time":"2021-11-27T00:00:00Z","timestamp":1637971200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"published-print":{"date-parts":[[2022,1]]},"DOI":"10.1007\/s42979-021-00975-0","type":"journal-article","created":{"date-parts":[[2021,11,27]],"date-time":"2021-11-27T13:03:40Z","timestamp":1638018220000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Bornon: Bengali Image Captioning with Transformer-Based Deep Learning Approach"],"prefix":"10.1007","volume":"3","author":[{"given":"Faisal","family":"Muhammad Shah","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6179-800X","authenticated-orcid":false,"given":"Mayeesha","family":"Humaira","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Md Abidur Rahman Khan","family":"Jim","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Amit","family":"Saha Ami","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shimul","family":"Paul","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2021,11,27]]},"reference":[{"key":"975_CR1","doi-asserted-by":"publisher","unstructured":"Subash R, Jebakumar R, Kamdar Y, Bhatt N. Automatic image captioning using convolution neural networks and LSTM. J Phys Conf Ser. 2019. https:\/\/doi.org\/10.1088\/1742-6596\/1362\/1\/012096.","DOI":"10.1088\/1742-6596\/1362\/1\/012096"},{"key":"975_CR2","doi-asserted-by":"publisher","unstructured":"Wang C, Yang H, Meinel C. Image captioning with deep bidirectional LSTMs and multi-task learning. ACM Trans Multimed Comput Commun Appl. 2018. https:\/\/doi.org\/10.1145\/3115432.","DOI":"10.1145\/3115432"},{"issue":"2","key":"975_CR3","doi-asserted-by":"publisher","first-page":"698","DOI":"10.14569\/IJACSA.2021.0120287","volume":"12","author":"M Humaira","year":"2021","unstructured":"Humaira M, Paul S, Jim MARK, Ami AS, Shah FM. A hybridized deep learning method for Bengali image captioning. IJACSA 2021;12(2):698\u2013707.","journal-title":"IJACSA"},{"key":"975_CR4","unstructured":"Vaswani A, et al. Attention Is all you need. arXiv:1706.03762, 2017."},{"key":"975_CR5","doi-asserted-by":"crossref","unstructured":"Zhang W, Nie W, Li X, Yu Y. Image caption generation with adaptive transformer. IEEE 2019; pp. 521\u201326.","DOI":"10.1109\/YAC.2019.8787715"},{"key":"975_CR6","doi-asserted-by":"crossref","unstructured":"He S, Liao W, Tavakoli HR, Yang M. Image captioning through image transformer, University of Twente, 2021. pp.153\u201369 .","DOI":"10.1007\/978-3-030-69538-5_10"},{"key":"975_CR7","doi-asserted-by":"publisher","unstructured":"Ami AS, Humaira M, Jim MARK, Paul S, Shah FM: Bengali image captioning with visual attention. 2020 23rd International Conference on Computer and Information Technology (ICCIT), 2020, pp. 1\u20135. https:\/\/doi.org\/10.1109\/ICCIT51783.2020.9392709.","DOI":"10.1109\/ICCIT51783.2020.9392709"},{"key":"975_CR8","doi-asserted-by":"publisher","first-page":"636","DOI":"10.1016\/j.procs.2019.06.100","volume":"154","author":"M Rahman","year":"2018","unstructured":"Rahman M, Mohammed N, Mansoor N, Momen S. Chittron. An automatic Bangla image captioning system. Procedia Comput Sci. 2018;154:636\u201342. https:\/\/doi.org\/10.1016\/j.procs.2019.06.100.","journal-title":"Procedia Comput Sci"},{"issue":"6","key":"975_CR9","doi-asserted-by":"publisher","first-page":"7427","DOI":"10.3233\/JIFS-179351","volume":"37","author":"T Deb","year":"2019","unstructured":"Deb T, et al. Oboyob: A sequential-semantic Bengali image captioning engine. J Intell Fuzzy Syst. 2019;37(6):7427\u201339. https:\/\/doi.org\/10.3233\/JIFS-179351.","journal-title":"J Intell Fuzzy Syst"},{"key":"975_CR10","doi-asserted-by":"publisher","unstructured":"Kamal AH, Jishan MA, Mansoor N. TextMage: the automated Bangla caption generator based on deep learning. In: 2020 International Conference on Decision Aid Sciences and Application (DASA), Sakheer, Bahrain, 2020; pp. 822\u201326. https:\/\/doi.org\/10.1109\/DASA51403.2020.9317108.","DOI":"10.1109\/DASA51403.2020.9317108."},{"key":"975_CR11","doi-asserted-by":"crossref","unstructured":"Khan MF. Improved Bengali image captioning via deep convolutional neural network based encoder\u2013decoder model improved Bengali image captioning via deep convolutional neural network based encoder\u2013decoder model. In: Proceedings of International Joint Conference on Advances in Computational Intelligence, 2020. pp. 217\u201329.","DOI":"10.1007\/978-981-16-0586-4_18"},{"key":"975_CR12","doi-asserted-by":"publisher","unstructured":"Kalam A, Azad A, Paul B. Bangla language textual image description by hybrid neural network model Bangla language textual image description by hybrid neural network model no. February, 2021; pp. 757\u201367. https:\/\/doi.org\/10.11591\/ijeecs.v21.i2.pp757-767.","DOI":"10.11591\/ijeecs.v21.i2.pp757-767."},{"key":"975_CR13","doi-asserted-by":"publisher","unstructured":"Aneja J, Deshpande A, Schwing AG. Convolutional image captioning. In: Proceedings of IEEE Comput Soc Conf Comput Vis Pattern Recognit. 2018; pp.5561\u201370. https:\/\/doi.org\/10.1109\/CVPR.2018.00583.","DOI":"10.1109\/CVPR.2018.00583"},{"key":"975_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1051\/matecconf\/201823201052","volume":"232","author":"S Liu","year":"2018","unstructured":"Liu S, Bai L, Hu Y, Wang H. Image captioning based on deep neural networks. MATEC Web Conf. 2018;232:1\u20137. https:\/\/doi.org\/10.1051\/matecconf\/201823201052.","journal-title":"MATEC Web Conf"},{"key":"975_CR15","doi-asserted-by":"publisher","unstructured":"Lan W, Li X, Dong J. Fluency-guided cross-lingual image captioning, MM 2017. In: Proc. 2017 ACM Multimed. Conf., 2017. pp. 1549\u201357. 2017.https:\/\/doi.org\/10.1145\/3123266.3123366.","DOI":"10.1145\/3123266.3123366."},{"key":"975_CR16","doi-asserted-by":"publisher","unstructured":"Li X, Lan W, Dong J, Liu H. Adding Chinese captions to images, ICMR 2016. In: Proc. 2016 ACM Int. Conf. Multimed. Retr., 2016; pp. 271\u201375, doi: https:\/\/doi.org\/10.1145\/2911996.2912049.","DOI":"10.1145\/2911996.2912049."},{"key":"975_CR17","doi-asserted-by":"publisher","unstructured":"Yoshikawa Y, Shigeto Y, Takeuchi A. STAIR captions: constructing a large-scale Japanese image caption dataset. ACL 2017. In: 55th Annu. Meet. Assoc. Comput. Linguist. Proc. Conf. (Long Pap.,2017; vol. 2, pp. 417\u201321. https:\/\/doi.org\/10.18653\/v1\/P17-2066.","DOI":"10.18653\/v1\/P17-2066"},{"key":"975_CR18","doi-asserted-by":"crossref","unstructured":"Jindal V. Generating image captions in Arabic using root-word based recurrent neural networks and deep neural networks. In: Association for Computational Linguistics: Student Research Workshop; 2018. pp. 144\u201351.","DOI":"10.18653\/v1\/N18-4020"},{"key":"975_CR19","doi-asserted-by":"publisher","unstructured":"Nugraha AA, Arifianto A, Suyanto. Generating image description on Indonesian language using convolutional neural network and gated recurrent unit. In: 7th Int. Conf. Inf. Commun. Technol. ICoICT; 2019, pp. 1\u20136. https:\/\/doi.org\/10.1109\/ICoICT.2019.8835370.","DOI":"10.1109\/ICoICT.2019.8835370"},{"key":"975_CR20","doi-asserted-by":"crossref","unstructured":"Li G, Zhu L, Liu P, Yang Y. Entangled transformer for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), 2019, pp. 8928\u201337.","DOI":"10.1109\/ICCV.2019.00902"},{"key":"975_CR21","unstructured":"Herdade S, Kappeler A, Boakye K, Soares J. Image captioning: transforming objects into words; 2019. arXiv:1906.05963"},{"key":"975_CR22","doi-asserted-by":"publisher","unstructured":"Atliha V, \u0160e\u0161ok D. Applied sciences text augmentation using BERT for image captioning; 2020. https:\/\/doi.org\/10.3390\/app10175978.","DOI":"10.3390\/app10175978."},{"key":"975_CR23","doi-asserted-by":"crossref","unstructured":"Lee H, Yoon S, Dernoncourt F, Kim DS, Bui T, Jung K. ViLBERTScore: evaluating image caption using vision-and-language BERT; 2020; pp. 34\u20139.","DOI":"10.18653\/v1\/2020.eval4nlp-1.4"},{"key":"975_CR24","doi-asserted-by":"publisher","unstructured":"Zhu X, Li L, Liu J, Peng H, Niu X. Applied sciences captioning transformer with stacked attention modules. https:\/\/doi.org\/10.3390\/app8050739.","DOI":"10.3390\/app8050739."},{"key":"975_CR25","unstructured":"Xu K, Ba J, Kiros R, Cho K. Conference on machine, and undefined 2015. Show, attend and tell: Neural image caption generation with visual attention. jmlr.org. Accessed 15 Aug 2020. http:\/\/www.jmlr.org\/proceedings\/papers\/v37\/xuc15.pdf."},{"key":"975_CR26","doi-asserted-by":"publisher","unstructured":"Chen L et al..SCA-CNN: Spatial and channel-wise attention in convolutional networks for image captioning. In: Proceedings of 30th IEEE Conf. Comput. Vis. Pattern Recognition, CVPR 2017, 2017, vol. 2017, pp. 6298\u2013306, https:\/\/doi.org\/10.1109\/CVPR.2017.667.","DOI":"10.1109\/CVPR.2017.667."},{"key":"975_CR27","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/84","author":"H Chen","year":"2018","unstructured":"Chen H, Ding G, Lin Z, Zhao S, Han J. Show, observe and tell: Attribute-driven attention model for image captioning. IJCAI. 2018. https:\/\/doi.org\/10.24963\/ijcai.2018\/84.","journal-title":"IJCAI"},{"key":"975_CR28","doi-asserted-by":"crossref","unstructured":"You Q, et al. Image Captioning with Semantic Attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2016;4651\u20139.","DOI":"10.1109\/CVPR.2016.503"},{"key":"975_CR29","unstructured":"Chung J, Gulcehre C, Cho K, Bengio Y. Empirical evaluation of gated recurrent neural networks on sequence modeling, 2014. pp. 1\u20139. arXiv:1412.3555."},{"key":"975_CR30","doi-asserted-by":"publisher","first-page":"2818","DOI":"10.1109\/CVPR.2016.308","volume":"2016","author":"C Szegedy","year":"2016","unstructured":"Szegedy C, Vanhoucke V, Ioffe S, Shlens J, Wojna Z. Rethinking the inception architecture for computer vision. Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit. 2016;2016:2818\u201326. https:\/\/doi.org\/10.1109\/CVPR.2016.308.","journal-title":"Proc IEEE Comput Soc Conf Comput Vis Pattern Recognit"},{"issue":"8","key":"975_CR31","doi-asserted-by":"publisher","first-page":"1037","DOI":"10.1167\/9.8.1037","volume":"9","author":"L Fei-Fei","year":"2010","unstructured":"Fei-Fei L, Deng J, Li K. ImageNet: constructing a large-scale image database. J Vis. 2010;9(8):1037. https:\/\/doi.org\/10.1167\/9.8.1037.","journal-title":"J Vis"},{"key":"975_CR32","doi-asserted-by":"publisher","unstructured":"Chollet F. Xception: deep learning with depthwise separable convolutions. Proceedings of 30th IEEE Conference of Computer Vision and Pattern Recognition, CVPR 2017, vol. 2017, pp. 1800\u20137, 2017. https:\/\/doi.org\/10.1109\/CVPR.2017.195.","DOI":"10.1109\/CVPR.2017.195."},{"key":"975_CR33","unstructured":"Manning CD, Pham H, Luong MT. Effective approaches to attention-based neural machine translation, 2015. arXiv:1508.04025."},{"key":"975_CR34","unstructured":"Bahdanau D, Cho K, Bengio Y. Neural machine translation by jointly learning to align and translate, 2015. pp. 1\u201315. arXiv:1409.0473"},{"key":"975_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.3115\/1073083.1073135","volume":"22176","author":"K Papineni","year":"2001","unstructured":"Papineni K, Roukos S, Ward T, Zhu W, Heights Y. IBM Research Report Bleu: a method for automatic evaluation of machine translation. Science 2001;22176:1\u201310. https:\/\/doi.org\/10.3115\/1073083.1073135.","journal-title":"Science"},{"key":"975_CR36","doi-asserted-by":"publisher","unstructured":"Denkowski M, Lavie A. Meteor universal: language specific translation evaluation for any target language. Language Technologies Institute; 2015. pp. 376\u201380 https:\/\/doi.org\/10.3115\/v1\/w14-3348.","DOI":"10.3115\/v1\/w14-3348."}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-021-00975-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-021-00975-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-021-00975-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,10]],"date-time":"2022-01-10T18:15:28Z","timestamp":1641838528000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-021-00975-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,27]]},"references-count":36,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2022,1]]}},"alternative-id":["975"],"URL":"https:\/\/doi.org\/10.1007\/s42979-021-00975-0","relation":{},"ISSN":["2662-995X","2661-8907"],"issn-type":[{"value":"2662-995X","type":"print"},{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,11,27]]},"assertion":[{"value":"2 September 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 November 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 November 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"There are no conflicts of interest to disclose in the subject matter or materials discussed in this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}}],"article-number":"90"}}