{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T10:43:54Z","timestamp":1776077034559,"version":"3.50.1"},"reference-count":23,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,1,9]],"date-time":"2025-01-09T00:00:00Z","timestamp":1736380800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,9]],"date-time":"2025-01-09T00:00:00Z","timestamp":1736380800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,1,9]]},"DOI":"10.1109\/ipas63548.2025.10924492","type":"proceedings-article","created":{"date-parts":[[2025,3,22]],"date-time":"2025-03-22T00:29:13Z","timestamp":1742603353000},"page":"1-5","source":"Crossref","is-referenced-by-count":4,"title":["Optimizing Multimodal Transformers for Medical Image Captioning: Enhancing Automated Descriptions via AI Systems"],"prefix":"10.1109","author":[{"given":"Mithila","family":"Arman","sequence":"first","affiliation":[{"name":"BRAC University,Dept. of CSE,Dhaka,Bangladesh"}]},{"given":"Md. Khurshid","family":"Jahan","sequence":"additional","affiliation":[{"name":"North South University,Dept. of ECE,Dhaka,Bangladesh"}]},{"given":"Ahmed Faizul Haque","family":"Dhrubo","sequence":"additional","affiliation":[{"name":"North South University,Dept. of ECE,Dhaka,Bangladesh"}]},{"given":"Md. Mahfuzur","family":"Rhaman","sequence":"additional","affiliation":[{"name":"George Mason University,Dept. of Computational Data Science,Fairfax,Virginia,USA"}]},{"given":"Sumaya Binte Zilani","family":"Choya","sequence":"additional","affiliation":[{"name":"American International University-Bangladesh,Dept. of CSE,Dhaka,Bangladesh"}]},{"given":"Din Mohammad","family":"Dohan","sequence":"additional","affiliation":[{"name":"BRAC University,Dept. of CSE,Dhaka,Bangladesh"}]},{"given":"Md. Ashiq","family":"Ul Islam Sajid","sequence":"additional","affiliation":[{"name":"BRAC University,Dept. of CSE,Dhaka,Bangladesh"}]},{"given":"Md. Golam","family":"Rabiul Alam","sequence":"additional","affiliation":[{"name":"BRAC University,Dept. of CSE,Dhaka,Bangladesh"}]}],"member":"263","reference":[{"key":"ref1","article-title":"X-Ray: What It Is, Types, Preparation and Risks","volume-title":"Cleveland Clinic","year":"2022"},{"issue":"2","key":"ref2","first-page":"112","article-title":"Multimodality and Efficiency in Natural Language Processing","volume-title":"INTERNATIONAL JOURNAL OF COMPUTER ENGINEERING & TECHNOLOGY","volume":"14","author":"Thatikonda","year":"2023"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.3390\/sym13071184"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/b978-0-12-824020-5.00015-6"},{"key":"ref5","article-title":"Papers with Code - Image Captioning"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2023.117071"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126287"},{"key":"ref8","article-title":"CLIP: Connecting text and images","year":"2021"},{"key":"ref9","article-title":"Work in Progress VISUALBERT: A SIMPLE AND PERFOR-MANT BASELINE FOR VISION AND LANGUAGE","author":"Liunian"},{"key":"ref10","doi-asserted-by":"crossref","article-title":"UNITER: UNiversal Image-TExt Representation Learning","author":"Chen","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref11","article-title":"ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision","author":"Kim","year":"2021"},{"key":"ref12","article-title":"GitHub - microsoft\/Swin-Transformer: This is an official implementation for \u2018Swin Transformer: Hierarchical Vision Transformer using Shifted Windows\u2019","volume-title":"GitHub","year":"2021"},{"key":"ref13","article-title":"Pixel-BERT: Aligning Image Pixels with Text by Deep Multi-Modal Transformers","author":"Huang","year":"2020"},{"key":"ref14","doi-asserted-by":"crossref","DOI":"10.1109\/ICCV.2019.00756","article-title":"VideoBERT: A Joint Model for Video and Language Representation Learning","author":"Sun","year":"2019"},{"key":"ref15","article-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach","author":"Liu","year":"2019"},{"key":"ref16","article-title":"Vision Transformer (ViT)"},{"key":"ref17","article-title":"BART"},{"key":"ref18","article-title":"DeiT","year":"2020"},{"key":"ref19","article-title":"MBart and MBart-50"},{"key":"ref20","article-title":"OpenAI GPT2"},{"key":"ref21","article-title":"roco-dataset\/data\/test\/radiology\/keywords.txt at master \u00b7 razorx89\/roco-dataset","volume-title":"GitHub","year":"2024"},{"key":"ref22","article-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation","author":"Li"},{"key":"ref23","article-title":"GIT: A Generative Image-to-text Transformer for Vision and Language","author":"Wang"}],"event":{"name":"2025 IEEE 6th International Conference on Image Processing, Applications and Systems (IPAS)","location":"Lyon, France","start":{"date-parts":[[2025,1,9]]},"end":{"date-parts":[[2025,1,11]]}},"container-title":["2025 IEEE 6th International Conference on Image Processing, Applications and Systems (IPAS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10924471\/10924472\/10924492.pdf?arnumber=10924492","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T17:52:42Z","timestamp":1742838762000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10924492\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,9]]},"references-count":23,"URL":"https:\/\/doi.org\/10.1109\/ipas63548.2025.10924492","relation":{},"subject":[],"published":{"date-parts":[[2025,1,9]]}}}