{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T10:02:52Z","timestamp":1764842572265,"version":"3.40.3"},"publisher-location":"Cham","reference-count":49,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030926588"},{"type":"electronic","value":"9783030926595"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-92659-5_25","type":"book-chapter","created":{"date-parts":[[2022,1,13]],"date-time":"2022-01-13T07:09:18Z","timestamp":1642057758000},"page":"392-404","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["CAGAN: Text-To-Image Generation with\u00a0Combined Attention Generative Adversarial Networks"],"prefix":"10.1007","author":[{"given":"Henning","family":"Schulze","sequence":"first","affiliation":[]},{"given":"Dogucan","family":"Yaman","sequence":"additional","affiliation":[]},{"given":"Alexander","family":"Waibel","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,1,13]]},"reference":[{"key":"25_CR1","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1016\/j.neucom.2018.05.080","volume":"311","author":"S Bai","year":"2018","unstructured":"Bai, S., An, S.: A survey on automatic image caption generation. Neurocomputing 311, 291\u2013304 (2018)","journal-title":"Neurocomputing"},{"key":"25_CR2","unstructured":"Barratt, S.T., Sharma, R.: A note on the inception score. CoRR abs\/1801.01973 (2018)"},{"key":"25_CR3","doi-asserted-by":"publisher","first-page":"183706","DOI":"10.1109\/ACCESS.2019.2958864","volume":"7","author":"Y Cai","year":"2019","unstructured":"Cai, Y., et al.: Dualattn-GAN: text to image synthesis with dual attentional generative adversarial network. IEEE Access 7, 183706\u2013183716 (2019)","journal-title":"IEEE Access"},{"key":"25_CR4","doi-asserted-by":"crossref","unstructured":"Cheng, J., Wu, F., Tian, Y., Wang, L., Tao, D.: RiFeGAN: rich feature generation for text-to-image synthesis from prior knowledge. In: CVPR, pp. 10908\u201310917 (2020)","DOI":"10.1109\/CVPR42600.2020.01092"},{"key":"25_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"483","DOI":"10.1007\/978-3-030-30493-5_47","volume-title":"Artificial Neural Networks and Machine Learning \u2013 ICANN 2019: Workshop and Special Sessions","author":"Q Cheng","year":"2019","unstructured":"Cheng, Q., Gu, X.: Hybrid attention driven text-to-image synthesis via generative adversarial networks. In: Tetko, I.V., K\u016frkov\u00e1, V., Karpov, P., Theis, F. (eds.) ICANN 2019. LNCS, vol. 11731, pp. 483\u2013495. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-30493-5_47"},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"Dorta, G., Vicente, S., Agapito, L., Campbell, N.D.F., Prince, S., Simpson, I.: Laplacian pyramid of conditional variational autoencoders. In: CVMP, pp. 7:1\u20137:9 (2017)","DOI":"10.1145\/3150165.3150172"},{"key":"25_CR7","doi-asserted-by":"crossref","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: CVPR, pp. 1473\u20131482 (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"25_CR8","unstructured":"Goodfellow, I.J., et al.: Generative adversarial nets. In: NIPS, pp. 2672\u20132680 (2014)"},{"key":"25_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"610","DOI":"10.1007\/978-3-030-01237-3_37","volume-title":"Computer Vision \u2013 ECCV 2018","author":"T Gupta","year":"2018","unstructured":"Gupta, T., Schwenk, D., Farhadi, A., Hoiem, D., Kembhavi, A.: Imagine this! Scripts to compositions to videos. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11212, pp. 610\u2013626. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01237-3_37"},{"key":"25_CR10","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"25_CR11","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: NIPS, pp. 6626\u20136637 (2017)"},{"key":"25_CR12","unstructured":"Hinz, T., Heinrich, S., Wermter, S.: Semantic object accuracy for generative text-to-image synthesis. CoRR abs\/1910.13321 (2019)"},{"key":"25_CR13","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: CVPR, pp. 7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"25_CR14","doi-asserted-by":"crossref","unstructured":"Johnson, J., Gupta, A., Fei-Fei, L.: Image generation from scene graphs. In: CVPR, pp. 1219\u20131228 (2018)","DOI":"10.1109\/CVPR.2018.00133"},{"key":"25_CR15","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (Poster) (2015)"},{"key":"25_CR16","doi-asserted-by":"crossref","unstructured":"Kulkarni, T.D., Kohli, P., Tenenbaum, J.B., Mansinghka, V.K.: Picture: a probabilistic programming language for scene perception. In: CVPR, pp. 4390\u20134399 (2015)","DOI":"10.1109\/CVPR.2015.7299068"},{"key":"25_CR17","unstructured":"Kulkarni, T.D., Whitney, W.F., Kohli, P., Tenenbaum, J.B.: Deep convolutional inverse graphics network. In: NIPS, pp. 2539\u20132547 (2015)"},{"key":"25_CR18","unstructured":"Li, B., Qi, X., Lukasiewicz, T., Torr, P.H.S.: Controllable text-to-image generation. In: NIPS, pp. 2063\u20132073 (2019)"},{"key":"25_CR19","doi-asserted-by":"crossref","unstructured":"Li, W., et al.: Object-driven text-to-image synthesis via adversarial training. In: CVPR, pp. 12174\u201312182 (2019)","DOI":"10.1109\/CVPR.2019.01245"},{"issue":"4","key":"25_CR20","first-page":"68","volume":"39","author":"Z Li","year":"2019","unstructured":"Li, Z., Wu, M., Zheng, J., Yu, H.: Perceptual adversarial networks with a feature pyramid for image translation. IEEE CG&A 39(4), 68\u201377 (2019)","journal-title":"IEEE CG&A"},{"key":"25_CR21","unstructured":"Liang, J., Pei, W., Lu, F.: CPGAN: full-spectrum content-parsing generative adversarial networks for text-to-image synthesis. CoRR abs\/1912.08562 (2019)"},{"key":"25_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"25_CR23","unstructured":"Lucic, M., Kurach, K., Michalski, M., Gelly, S., Bousquet, O.: Are GANs created equal? A large-scale study. In: NIPS, pp. 698\u2013707 (2018)"},{"key":"25_CR24","unstructured":"Miyato, T., Kataoka, T., Koyama, M., Yoshida, Y.: Spectral normalization for generative adversarial networks. In: ICLR, Conference Track Proceedings. OpenReview.net (2018)"},{"key":"25_CR25","unstructured":"Odena, A., Olah, C., Shlens, J.: Conditional image synthesis with auxiliary classifier GANs. In: Proceedings of Machine Learning Research, ICML, vol. 70, pp. 2642\u20132651. PMLR (2017)"},{"key":"25_CR26","unstructured":"van den Oord, A., Kalchbrenner, N., Espeholt, L., Kavukcuoglu, K., Vinyals, O., Graves, A.: Conditional image generation with PixelCNN decoders. In: NIPS, pp. 4790\u20134798 (2016)"},{"key":"25_CR27","unstructured":"Parmar, N., Ramachandran, P., Vaswani, A., Bello, I., Levskaya, A., Shlens, J.: Stand-alone self-attention in vision models. In: NIPS, pp. 68\u201380 (2019)"},{"key":"25_CR28","volume-title":"Physically Based Rendering: From Theory to Implementation","author":"M Pharr","year":"2016","unstructured":"Pharr, M., Jakob, W., Humphreys, G.: Physically Based Rendering: From Theory to Implementation. Morgan Kaufmann, Burlington (2016)"},{"key":"25_CR29","unstructured":"Qiao, T., Zhang, J., Xu, D., Tao, D.: Learn, imagine and create: text-to-image generation from prior knowledge. In: NIPS, pp. 885\u2013895 (2019)"},{"key":"25_CR30","doi-asserted-by":"crossref","unstructured":"Qiao, T., Zhang, J., Xu, D., Tao, D.: MirrorGAN: learning text-to-image generation by redescription. In: CVPR, pp. 1505\u20131514 (2019)","DOI":"10.1109\/CVPR.2019.00160"},{"key":"25_CR31","unstructured":"Reed, S.E., Akata, Z., Mohan, S., Tenka, S., Schiele, B., Lee, H.: Learning what and where to draw. In: NIPS, pp. 217\u2013225 (2016)"},{"key":"25_CR32","unstructured":"Reed, S.E., Akata, Z., Yan, X., Logeswaran, L., Schiele, B., Lee, H.: Generative adversarial text to image synthesis. In: JMLR Workshop and Conference Proceedings, ICML, vol. 48, pp. 1060\u20131069 (2016)"},{"key":"25_CR33","unstructured":"Reed, S.E., et al.: Parallel multiscale autoregressive density estimation. In: Proceedings of Machine Learning Research, ICML, vol. 70, pp. 2912\u20132921 (2017)"},{"key":"25_CR34","unstructured":"Rosca, M., Lakshminarayanan, B., Warde-Farley, D., Mohamed, S.: Variational approaches for auto-encoding generative adversarial networks. CoRR abs\/1706.04987 (2017)"},{"key":"25_CR35","unstructured":"Salimans, T., Goodfellow, I.J., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. In: NIPS, pp. 2226\u20132234 (2016)"},{"key":"25_CR36","doi-asserted-by":"crossref","unstructured":"Snell, J., Ridgeway, K., Liao, R., Roads, B.D., Mozer, M.C., Zemel, R.S.: Learning to generate images with perceptual similarity metrics. In: ICIP, pp. 4277\u20134281 (2017)","DOI":"10.1109\/ICIP.2017.8297089"},{"key":"25_CR37","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: CVPR, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"25_CR38","doi-asserted-by":"crossref","unstructured":"Tan, H., Liu, X., Li, X., Zhang, Y., Yin, B.: Semantics-enhanced adversarial nets for text-to-image synthesis. In: ICCV, pp. 10500\u201310509 (2019)","DOI":"10.1109\/ICCV.2019.01060"},{"key":"25_CR39","doi-asserted-by":"crossref","unstructured":"Tao, M., Tang, H., Wu, S., Sebe, N., Wu, F., Jing, X.: DF-GAN: deep fusion generative adversarial networks for text-to-image synthesis. CoRR abs\/2008.05865 (2020)","DOI":"10.1109\/ICIBA50161.2020.9277299"},{"key":"25_CR40","unstructured":"Theis, L., Bethge, M.: Generative image modeling using spatial LSTMs. In: NIPS, pp. 1927\u20131935 (2015)"},{"key":"25_CR41","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The Caltech-UCSD birds-200-2011 dataset. Technical report CNS-TR-2011-001 (2011)"},{"key":"25_CR42","doi-asserted-by":"crossref","unstructured":"Wu, J., Tenenbaum, J.B., Kohli, P.: Neural scene de-rendering. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.744"},{"key":"25_CR43","unstructured":"Xie, J., Lu, Y., Zhu, S., Wu, Y.N.: A theory of generative convnet. In: JMLR Workshop and Conference Proceedings, ICML, vol. 48, pp. 2635\u20132644 (2016)"},{"key":"25_CR44","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: JMLR Workshop and Conference Proceedings, ICML, vol. 37, pp. 2048\u20132057 (2015)"},{"key":"25_CR45","doi-asserted-by":"crossref","unstructured":"Xu, T., et al.: AttnGAN: fine-grained text to image generation with attentional generative adversarial networks. In: CVPR, pp. 1316\u20131324 (2018)","DOI":"10.1109\/CVPR.2018.00143"},{"key":"25_CR46","doi-asserted-by":"crossref","unstructured":"Yin, G., Liu, B., Sheng, L., Yu, N., Wang, X., Shao, J.: Semantics disentangling for text-to-image generation. In: CVPR, pp. 2327\u20132336 (2019)","DOI":"10.1109\/CVPR.2019.00243"},{"key":"25_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, H., Xu, T., Li, H.: StackGAN: text to photo-realistic image synthesis with stacked generative adversarial networks. In: ICCV, pp. 5908\u20135916 (2017)","DOI":"10.1109\/ICCV.2017.629"},{"key":"25_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: StackGAN++: realistic image synthesis with stacked generative adversarial networks. CoRR abs\/1710.10916 (2017)","DOI":"10.1109\/ICCV.2017.629"},{"key":"25_CR49","doi-asserted-by":"crossref","unstructured":"Zhu, M., Pan, P., Chen, W., Yang, Y.: DM-GAN: dynamic memory generative adversarial networks for text-to-image synthesis. In: CVPR, pp. 5802\u20135810 (2019)","DOI":"10.1109\/CVPR.2019.00595"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-92659-5_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,5]],"date-time":"2022-05-05T14:49:01Z","timestamp":1651762141000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-92659-5_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030926588","9783030926595"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-92659-5_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"13 January 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DAGM GCPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"DAGM German Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bonn","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 October 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"43","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dagm2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dagm-gcpr.de\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"116","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"46","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"40% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.95","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}