{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T17:22:00Z","timestamp":1765387320638,"version":"3.46.0"},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"37","license":[{"start":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T00:00:00Z","timestamp":1750464000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T00:00:00Z","timestamp":1750464000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-025-20990-0","type":"journal-article","created":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T05:21:22Z","timestamp":1750483282000},"page":"46173-46189","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Text-conditioned image generation using diffusion models"],"prefix":"10.1007","volume":"84","author":[{"given":"Dhanya","family":"Srinivasan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6433-8842","authenticated-orcid":false,"given":"P.","family":"Mirunalini","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Karthik","family":"Desingu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Maheshwari","family":"M R","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,6,21]]},"reference":[{"key":"20990_CR1","doi-asserted-by":"publisher","unstructured":"Alhabeeb SK, Al-Shargabi AA (2024) Text-to-image synthesis with generative models: Methods, datasets, performance metrics, challenges, and future direction. IEEE Access PP:1\u20131, 01. https:\/\/doi.org\/10.1109\/ACCESS.2024.3365043","DOI":"10.1109\/ACCESS.2024.3365043"},{"key":"20990_CR2","unstructured":"Banerjee S, Lavie A (2005) Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"20990_CR3","unstructured":"Chen X, Mishra N, Rohaninejad M, Abbeel P (2018) Pixelsnail: An improved autoregressive generative model. In: International conference on machine learning, pp 864\u2013872. PMLR"},{"key":"20990_CR4","unstructured":"Devlin J, Chang MW, Lee K, Toutanova K (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805"},{"key":"20990_CR5","unstructured":"Srinivasan D, Subhashree M, Mirunalini P, Jaisakthi SM (2024) Multimodal learning for image-text matching: A blip-based approach. In: https:\/\/ceur-ws.org\/Vol-3658\/paper23.pdf CEUR"},{"key":"20990_CR6","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv:2010.11929"},{"key":"20990_CR7","doi-asserted-by":"crossref","unstructured":"Gu S, Chen D, Bao J, Wen F, Zhang B, Chen D, Yuan L, Guo B (2022) Vector quantized diffusion model for text-to-image synthesis. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10696\u201310706","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"20990_CR8","doi-asserted-by":"publisher","first-page":"04","DOI":"10.1016\/j.autcon.2024.105430","volume":"163","author":"B Huang","year":"2024","unstructured":"Huang B, Kang F, Li X, Zhu S (2024) Underwater dam crack image generation based on unsupervised image-to-image translation. Autom Constr 163:04. https:\/\/doi.org\/10.1016\/j.autcon.2024.105430","journal-title":"Autom Constr"},{"key":"20990_CR9","doi-asserted-by":"publisher","unstructured":"Huang M, Mao Z, Chen Z, Zhang Y (2023) Towards accurate image coding: Improved autoregressive image generation with dynamic vector quantization. In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 22596\u201322605. https:\/\/doi.org\/10.1109\/CVPR52729.2023.02164","DOI":"10.1109\/CVPR52729.2023.02164"},{"key":"20990_CR10","unstructured":"Kazemi H, Soleymani S, Taherkhani F, Iranmanesh S, Nasrabadi N (2018) Unsupervised image-to-image translation using domain-specific variational information bound. Advances in Neural Information Processing Systems, 31"},{"key":"20990_CR11","doi-asserted-by":"crossref","unstructured":"Kocasari U, Dirik A, Tiftikci M, Yanardag P (2022) Stylemc: Multi-channel based fast text-guided image generation and manipulation. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 895\u2013904","DOI":"10.1109\/WACV51458.2022.00350"},{"key":"20990_CR12","unstructured":"Li J, Li D, Xiong C, Hoi S (2022) Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International conference on machine learning, pp 12888\u201312900. PMLR"},{"key":"20990_CR13","unstructured":"Lin C-Y (2004) Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out, pp 74\u201381"},{"key":"20990_CR14","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft coco: Common objects in context. In: Computer vision\u2013ECCV 2014: 13th european conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp 740\u2013755. Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"issue":"7","key":"20990_CR15","doi-asserted-by":"publisher","first-page":"935","DOI":"10.1007\/s42979-024-03289-z","volume":"5","author":"R Mehmood","year":"2024","unstructured":"Mehmood R, Bashir R, Giri KJ (2024) Text conditioned generative adversarial networks generating images and videos: A critical review. SN Comput Sci 5(7):935","journal-title":"SN Comput Sci"},{"key":"20990_CR16","doi-asserted-by":"crossref","unstructured":"Mirunalini P, Karthik Desingu S, Aswatha R Deepika, Deepika V, Jaisakthi SM (2024) Conditional adversarial segmentation and deep learning approach for skin lesion sub-typing from dermoscopic images. Neural Computing and Applications, pp 1\u201319","DOI":"10.1007\/s00521-024-09964-9"},{"key":"20990_CR17","unstructured":"Mirunalini P, Sanjhay V, Rohitram S, Rohith M (2024b) Musti multimodal understanding of smells in texts and images using clip. In: https:\/\/ceur-ws.org\/Vol-3658\/paper17.pdf CEUR"},{"key":"20990_CR18","unstructured":"Nichol A, Dhariwal P, Ramesh A, Shyam P, Mishkin P, McGrew B, Sutskever I, Chen M (2021) Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv:2112.10741"},{"key":"20990_CR19","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the association for computational linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"20990_CR20","doi-asserted-by":"crossref","unstructured":"Qiao T, Zhang J, Xu D, Tao D (2019) Mirrorgan: Learning text-to-image generation by redescription. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1505\u20131514","DOI":"10.1109\/CVPR.2019.00160"},{"key":"20990_CR21","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J et al (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning, pp 8748\u20138763. PMLR"},{"key":"20990_CR22","unstructured":"Ramesh A, Dhariwal P, Nichol A, Chu C, Chen M (2022) Hierarchical text-conditional image generation with clip latents. 1(2):3. arXiv:2204.06125"},{"issue":"1","key":"20990_CR23","first-page":"16","volume":"20","author":"S Ramzan","year":"2022","unstructured":"Ramzan S, Iqbal MM, Kalsum T (2022) Text-to-image generation using deep learning. Eng Proc 20(1):16","journal-title":"Eng Proc"},{"key":"20990_CR24","unstructured":"Razavi A, Van den Oord A, Vinyals O (2019) Generating diverse high-fidelity images with vq-vae-2. Advances in Neural Information Processing Systems 32"},{"key":"20990_CR25","unstructured":"Reed S, Akata Z, Yan X, Logeswaran L, Schiele B, Lee H (2016) Generative adversarial text to image synthesis. In: International conference on machine learning, pp 1060\u20131069. PMLR"},{"key":"20990_CR26","unstructured":"Shuai X, Ding H, Ma X, Tu R, Jiang YG, Tao D (2024) A survey of multimodal-guided image editing with text-to-image diffusion models. arXiv:2406.14555"},{"key":"20990_CR27","unstructured":"Van Den Oord A, Vinyals O et al (2017) Neural discrete representation learning. Advances in Neural Information Processing Systems, 30"},{"key":"20990_CR28","doi-asserted-by":"crossref","unstructured":"Xia W, Yang Y, Xue JH, Wu B (2021) Tedigan: Text-guided diverse face image generation and manipulation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2256\u20132265","DOI":"10.1109\/CVPR46437.2021.00229"},{"key":"20990_CR29","doi-asserted-by":"crossref","unstructured":"Zhang R, Isola P, Efros AA, Shechtman E, Wang O (2018) The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 586\u2013595","DOI":"10.1109\/CVPR.2018.00068"},{"key":"20990_CR30","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TGRS.2024.3510781","volume":"62","author":"Z Zhang","year":"2024","unstructured":"Zhang Z, Li X, Li H, Dunkin F, Li B, Li Z (2024) Dual-branch sparse self-learning with instance binding augmentation for adversarial detection in remote sensing images. IEEE Transactions on Geoscience and Remote Sensing 62:1\u201313","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"20990_CR31","doi-asserted-by":"crossref","unstructured":"Zhou Y, Zhang R, Chen C, Li C, Tensmeyer C, Yu T, Gu J, Xu J, Sun T (2022) Towards language-free training for text-to-image generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 17907\u201317917","DOI":"10.1109\/CVPR52688.2022.01738"},{"key":"20990_CR32","doi-asserted-by":"crossref","unstructured":"Zhu JY, Park T, Isola P, Efros AA (2017) Unpaired image-to-image translation using cycle-consistent adversarial networks. In: Proceedings of the IEEE international conference on computer vision, pp 2223\u20132232","DOI":"10.1109\/ICCV.2017.244"},{"key":"20990_CR33","doi-asserted-by":"crossref","unstructured":"Zinnen M, Madhu P, Kosti R, Bell P, Maier A, Christlein V (2022) Odor: The icpr2022 odeuropa challenge on olfactory object recognition. In: 2022 26th International conference on pattern recognition (ICPR), pp 4989\u20134994. IEEE","DOI":"10.1109\/ICPR56361.2022.9956542"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-025-20990-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-025-20990-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-025-20990-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T17:17:25Z","timestamp":1765387045000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-025-20990-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,21]]},"references-count":33,"journal-issue":{"issue":"37","published-online":{"date-parts":[[2025,11]]}},"alternative-id":["20990"],"URL":"https:\/\/doi.org\/10.1007\/s11042-025-20990-0","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2025,6,21]]},"assertion":[{"value":"25 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 January 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 June 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 June 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics statement"}},{"value":"The authors declare that they have no conflict of interest.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}