{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T03:24:35Z","timestamp":1771471475582,"version":"3.50.1"},"reference-count":20,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,24]],"date-time":"2024-06-24T00:00:00Z","timestamp":1719187200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,24]],"date-time":"2024-06-24T00:00:00Z","timestamp":1719187200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,24]]},"DOI":"10.1109\/icccnt61001.2024.10724799","type":"proceedings-article","created":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T23:06:46Z","timestamp":1730761606000},"page":"1-5","source":"Crossref","is-referenced-by-count":3,"title":["Integrating Convolutional and Recurrent Networks for Image Caption Generation: A Unified Approach"],"prefix":"10.1109","author":[{"given":"I","family":"Nandhini","sequence":"first","affiliation":[{"name":"Vel Tech Rangarajan Dr. Sagunthala R&D Institute of Science and Technology,Department of Electronics & Communication Engineering,Chennai,India"}]},{"given":"L","family":"Leo Prasanth","sequence":"additional","affiliation":[{"name":"Anna University,Research Scholar Department of Information Science and Technology,Chennai,India"}]},{"given":"T","family":"Nagalakshmi","sequence":"additional","affiliation":[{"name":"Vel Tech Rangarajan Dr. Sagunthala R&D Institute of Science and Technology,Department of Mathematics,Chennai,India"}]},{"given":"D","family":"Manjula","sequence":"additional","affiliation":[{"name":"VIT University,Department of Computer Science and Engineering,Chennai,India"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3148210"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126287"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3617592"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1051\/matecconf\/201823201052"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref6","article-title":"ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Kim"},{"key":"ref7","article-title":"Vision transformer for image captioning and visual question answering: A comprehensive study","author":"Chen","journal-title":"arXiv:2203.01594."},{"key":"ref8","article-title":"Bidirectional vision transformer for image captioning and image generation","author":"Zhou","year":"2022","journal-title":"Computer Vision and Pattern Recognition"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICPCSN58827.2023.00062"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref11","first-page":"701","article-title":"BUTD: Bottom-up and top-down attention for image captioning","volume-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","author":"Chen"},{"key":"ref12","article-title":"ViLBERT: A vision and language transformer for image captioning","author":"Lu","year":"2019","journal-title":"Computer Vision and Pattern Recognition"},{"key":"ref13","first-page":"1194","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","author":"Dosovitskiy"},{"key":"ref14","first-page":"11178","article-title":"LXMERT: Learning cross-modality encoder representations from transformers","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Hu"},{"key":"ref15","article-title":"CLIP: Contrastive language-image pre-training","author":"Radford","year":"2021","journal-title":"arXiv:2201.07261"},{"key":"ref16","article-title":"Graph attention networks","author":"Velickovic","year":"2017","journal-title":"Machine Learning"},{"key":"ref17","first-page":"11231","article-title":"Multimodal Transformer: A unified transformer framework for multimodal tasks","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"Du"},{"issue":"10","key":"ref18","first-page":"2203","article-title":"Transforming Visual Understanding: A Comprehensive Survey of Image Captioning with Transformers","volume":"4","author":"Latha","year":"2023","journal-title":"International Journal of Research Publication and Reviews"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.17762\/ijritcc.v11i9.8981"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.14569\/IJACSA.2023.0140326"}],"event":{"name":"2024 15th International Conference on Computing Communication and Networking Technologies (ICCCNT)","location":"Kamand, India","start":{"date-parts":[[2024,6,24]]},"end":{"date-parts":[[2024,6,28]]}},"container-title":["2024 15th International Conference on Computing Communication and Networking Technologies (ICCCNT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10723818\/10723316\/10724799.pdf?arnumber=10724799","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T06:57:19Z","timestamp":1732690639000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10724799\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,24]]},"references-count":20,"URL":"https:\/\/doi.org\/10.1109\/icccnt61001.2024.10724799","relation":{},"subject":[],"published":{"date-parts":[[2024,6,24]]}}}