{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T14:23:45Z","timestamp":1762957425771,"version":"3.37.3"},"reference-count":73,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2024,4,2]],"date-time":"2024-04-02T00:00:00Z","timestamp":1712016000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,4,2]],"date-time":"2024-04-02T00:00:00Z","timestamp":1712016000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1007\/s11263-024-02044-4","type":"journal-article","created":{"date-parts":[[2024,4,2]],"date-time":"2024-04-02T15:02:46Z","timestamp":1712070166000},"page":"3537-3565","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["MMoT: Mixture-of-Modality-Tokens Transformer for Composed Multimodal Conditional Image Synthesis"],"prefix":"10.1007","volume":"132","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-9835-3353","authenticated-orcid":false,"given":"Jianbin","family":"Zheng","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daqing","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chaoyue","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minghui","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zuopeng","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Changxing","family":"Ding","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,4,2]]},"reference":[{"issue":"11","key":"2044_CR1","doi-asserted-by":"publisher","first-page":"7327","DOI":"10.1109\/TPAMI.2021.3116668","volume":"44","author":"S Bond-Taylor","year":"2021","unstructured":"Bond-Taylor, S., Leach, A., Long, Y., & Willcocks, C. G. (2021). Deep generative modelling: a comparative review of VAEs, GANs, normalizing flows, energy-based and autoregressive models. Transactions on Pattern Analysis and Machine Intelligence, 44(11), 7327\u20137347.","journal-title":"Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2044_CR2","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., & Ferrari, V. (2018). Cocostuff: Thing and stuff classes in context. In Conference on computer vision and pattern recognition (pp. 1209\u20131218).","DOI":"10.1109\/CVPR.2018.00132"},{"key":"2044_CR3","unstructured":"Chang, H., Zhang, H., Barber, J., Maschinot, A., Lezama, J., Jiang, L., Yang, M.-H., Murphy, K., Freeman, W. T., Rubinstein, M., Li, Y., & Krishnan, D. (2023). Muse: Text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704"},{"key":"2044_CR4","doi-asserted-by":"crossref","unstructured":"Chang, H., Zhang, H., Jiang, L., Liu, C., & Freeman, W.T. (2022). Maskgit: Masked generative image transformer. In Conference on computer vision and pattern recognition (pp. 11315\u2013 11325).","DOI":"10.1109\/CVPR52688.2022.01103"},{"issue":"4","key":"2044_CR5","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"L-C Chen","year":"2017","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A. L. (2017). DeepLab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. Transactions on Pattern Analysis and Machine Intelligence, 40(4), 834\u2013848.","journal-title":"Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2044_CR6","unstructured":"Chen, M., Radford, A., Child, R., Wu, J., Jun, H., Luan, D., & Sutskever, I. (2020). Generative pretraining from pixels. In International conference on machine learning (pp. 1691\u2013 1703)."},{"key":"2044_CR7","unstructured":"Child, R., Gray, S., Radford, A., & Sutskever, I. (2019). Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509"},{"key":"2044_CR8","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., & Nichol, A. (2021). Diffusion models beat GANs on image synthesis. Advances in Neural Information Processing Systems, 34, 8780\u20138794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2044_CR9","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., & Ommer, B. (2021). Taming transformers for high-resolution image synthesis. In Conference on computer vision and pattern recognition (pp. 12873\u201312883).","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"2044_CR10","doi-asserted-by":"crossref","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., & Taigman, Y. (2022). Make-ascene: Scene-based text-to-image generation with human priors. In European conference on computer vision (pp. 89\u2013106).","DOI":"10.1007\/978-3-031-19784-0_6"},{"issue":"11","key":"2044_CR11","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., & Bengio, Y. (2020). Generative adversarial networks. Communications of the ACM, 63(11), 139\u2013144.","journal-title":"Communications of the ACM"},{"key":"2044_CR12","doi-asserted-by":"crossref","unstructured":"He, S., Liao, W., Yang, M.Y., Yang, Y., Song, Y.-Z., Rosenhahn, B., & Xiang, T. (2021). Context-aware layout to image generation with enhanced object appearance. In Conference on computer vision and pattern recognition (pp. 15049\u201315058).","DOI":"10.1109\/CVPR46437.2021.01480"},{"key":"2044_CR13","first-page":"6626","volume":"30","author":"M Heusel","year":"2017","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., & Hochreiter, S. (2017). GANs trained by a two time-scale update rule converge to a local Nash equilibrium. Advances in Neural Information Processing Systems, 30, 6626\u20136637.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"8","key":"2044_CR14","doi-asserted-by":"publisher","first-page":"1771","DOI":"10.1162\/089976602760128018","volume":"14","author":"GE Hinton","year":"2002","unstructured":"Hinton, G. E. (2002). Training products of experts by minimizing contrastive divergence. Neural Computation, 14(8), 1771\u20131800.","journal-title":"Neural Computation"},{"key":"2044_CR15","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, 33, 6840\u20136851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2044_CR16","unstructured":"Huang, L., Chen, D., Liu, Y., Shen, Y., Zhao, D., & Zhou, J. (2023). Composer: Creative and controllable image synthesis with composable conditions. arXiv preprint arXiv:2302.09778"},{"key":"2044_CR17","doi-asserted-by":"crossref","unstructured":"Huang, X., Mallya, A., Wang, T.-C., & Liu, M.-Y. (2022). Multimodal conditional image synthesis with product-of-experts GANs. In European conference on computer vision (pp. 91\u2013109).","DOI":"10.1007\/978-3-031-19787-1_6"},{"key":"2044_CR18","unstructured":"Huang, Y., Du, C., Xue, Z., Chen, X., Zhao, H., & Huang, L. (2021). What makes multimodal learning better than single (provably). Advances in Neural Information Processing Systems, 34, 10944\u201310956."},{"key":"2044_CR19","unstructured":"Huang, Y., Lin, J., Zhou, C., Yang, H., & Huang, L. (2022). Modality competition: What makes joint training of multi-modal network fail in deep learning?(provably). In International conference on machine learning (pp. 9226\u20139259)."},{"key":"2044_CR20","unstructured":"Ismail, A.A., Hasan, M., & Ishtiaq, F. (2020). Improving multimodal accuracy through modality pre-training and attention. arXiv preprint arXiv:2011.06102"},{"key":"2044_CR21","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.-Y., Zhou, T., Efros, A. A. (2017). Image-to-image translation with conditional adversarial networks. In Conference on computer vision and pattern recognition (pp. 1125\u20131134).","DOI":"10.1109\/CVPR.2017.632"},{"key":"2044_CR22","unstructured":"Jahn, M., Rombach, R., Ommer, B. (2021). Highresolution complex scene synthesis with transformers. In Conference on computer vision and pattern recognition workshop."},{"key":"2044_CR23","unstructured":"Kingma, D. P., & Welling, M. (2014). Autoencoding variational Bayes. In International conference on learning representations."},{"issue":"11","key":"2044_CR24","doi-asserted-by":"publisher","first-page":"3964","DOI":"10.1109\/TPAMI.2020.2992934","volume":"43","author":"I Kobyzev","year":"2020","unstructured":"Kobyzev, I., Prince, S. J., & Brubaker, M. A. (2020). Normalizing flows: An introduction and review of current methods. Transactions on Pattern Analysis and Machine Intelligence, 43(11), 3964\u20133979.","journal-title":"Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2044_CR25","doi-asserted-by":"crossref","unstructured":"LeCun, Y., Chopra, S., Hadsell, R., Ranzato, M., & Huang, F. (2006). A tutorial on energy-based learning. Predicting Structured Data, 1(0).","DOI":"10.7551\/mitpress\/7443.003.0014"},{"key":"2044_CR26","doi-asserted-by":"crossref","unstructured":"Li, Z., Wu, J., Koh, I., Tang, Y., & Sun, L. (2021). Image synthesis from layout with locality-aware mask adaption. In International conference on computer vision (pp. 13819\u201313828).","DOI":"10.1109\/ICCV48922.2021.01356"},{"key":"2044_CR27","unstructured":"Li, Z., Zhou, H., Bai, S., Li, P., Zhou, C., & Yang, H. (2022). M6-fashion: High-fidelity multimodal image generation and editing. arXiv preprint arXiv:2205.11705"},{"key":"2044_CR28","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollar, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. In European conference on computer vision (pp. 740\u2013755).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2044_CR29","first-page":"568","volume":"32","author":"X Liu","year":"2019","unstructured":"Liu, X., Yin, G., Shao, J., Wang, X., & Li, H. (2019). Learning to predict layout-to-image conditional convolutions for semantic image synthesis. Advances in Neural Information Processing Systems, 32, 568\u2013578.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2044_CR30","unstructured":"Loshchilov, I., & Hutter, F. (2017). Decoupled weight decay regularization. In International conference on learning representations."},{"key":"2044_CR31","doi-asserted-by":"crossref","unstructured":"Ma, M., Ren, J., Zhao, L., Testuggine, D., & Peng, X. (2022). Are multimodal transformers robust to missing modality? In Conference on computer vision and pattern recognition (pp. 18177\u201318186).","DOI":"10.1109\/CVPR52688.2022.01764"},{"key":"2044_CR32","unstructured":"Mirza, M., & Osindero, S. (2014). Conditional generative adversarial nets. In Advances in neural information processing systems workshop."},{"key":"2044_CR33","unstructured":"Miyato, T., & Koyama, M. (2018). cgans with projection discriminator. In International conference on learning representations."},{"key":"2044_CR34","doi-asserted-by":"crossref","unstructured":"Mou, C., Wang, X., Xie, L., Zhang, J., Qi, Z., Shan, Y., Qie, X. (2023). T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"2044_CR35","unstructured":"Odena, A., Olah, C., & Shlens, J. (2017). Conditional image synthesis with auxiliary classifier GANs. In International conference on machine learning (pp. 2642\u20132651)."},{"issue":"1","key":"2044_CR36","first-page":"2617","volume":"22","author":"G Papamakarios","year":"2021","unstructured":"Papamakarios, G., Nalisnick, E., Rezende, D. J., Mohamed, S., & Lakshminarayanan, B. (2021). Normalizing flows for probabilistic modeling and inference. The Journal of Machine Learning Research, 22(1), 2617\u20132680.","journal-title":"The Journal of Machine Learning Research"},{"key":"2044_CR37","doi-asserted-by":"crossref","unstructured":"Park, T., Liu, M.-Y., Wang, T.-C., & Zhu, J.- Y. (2019). Semantic image synthesis with spatially-adaptive normalization. In Conference on computer vision and pattern recognition (pp. 2337\u20132346).","DOI":"10.1109\/CVPR.2019.00244"},{"key":"2044_CR38","doi-asserted-by":"crossref","unstructured":"Parmar, G., Zhang, R., & Zhu, J.-Y. (2021). On buggy resizing libraries and surprising subtleties in fid calculation. arXiv preprint arXiv:2104.11222","DOI":"10.1109\/CVPR52688.2022.01112"},{"key":"2044_CR39","unstructured":"Parmar, N., Vaswani, A., Uszkoreit, J., Kaiser, L., Shazeer, N., Ku, A., & Tran, D. (2018). Image transformer. In International conference on machine learning (pp. 4055\u20134064)."},{"key":"2044_CR40","doi-asserted-by":"crossref","unstructured":"Peng, X., Wei, Y., Deng, A., Wang, D., & Hu, D. (2022). Balanced multimodal learning via on-the-fly gradient modulation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 8238\u20138247).","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"2044_CR41","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning (pp. 8748\u20138763)."},{"key":"2044_CR42","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., & Chen, M. (2022). Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125"},{"key":"2044_CR43","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I. (2021). Zero-shot text-to-image generation. In International conference on machine learning (pp. 8821\u20138831)."},{"key":"2044_CR44","first-page":"14837","volume":"32","author":"A Razavi","year":"2019","unstructured":"Razavi, A., Van den Oord, A., & Vinyals, O. (2019). Generating diverse high-fidelity images with VQ-VAE-2. Advances in Neural Information Processing Systems, 32, 14837\u201314847.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2044_CR45","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Conference on computer vision and pattern recognition (pp. 10684\u201310695).","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2044_CR46","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E. L., Ghasemipour, K., Lopes, R. G., Ayan, B. K., Salimans, T., Ho, J., Fleet, D. J., & Norouzi, M. (2022). Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, 35, 36479\u201336494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2044_CR47","first-page":"2226","volume":"29","author":"T Salimans","year":"2016","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., & Chen, X. (2016). Improved techniques for training GANs. Advances in Neural Information Processing Systems, 29, 2226\u20132234.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"4","key":"2044_CR48","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2897824.2925972","volume":"35","author":"E Simo-Serra","year":"2016","unstructured":"Simo-Serra, E., Iizuka, S., Sasaki, K., & Ishikawa, H. (2016). Learning to simplify: Fully convolutional networks for rough sketch cleanup. ACM Transactions on Graphics, 35(4), 1\u201311.","journal-title":"ACM Transactions on Graphics"},{"key":"2044_CR49","doi-asserted-by":"crossref","unstructured":"Skorokhodov, I., Sotnikov, G., & Elhoseiny, M. (2021). Aligning latent and image spaces to connect the unconnectable. In International conference on computer vision (pp. 14144\u201314153).","DOI":"10.1109\/ICCV48922.2021.01388"},{"key":"2044_CR50","first-page":"3483","volume":"28","author":"K Sohn","year":"2015","unstructured":"Sohn, K., Lee, H., & Yan, X. (2015). Learning structured output representation using deep conditional generative models. Advances in Neural Information Processing Systems, 28, 3483\u20133491.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"9","key":"2044_CR51","first-page":"5070","volume":"44","author":"W Sun","year":"2021","unstructured":"Sun, W., & Wu, T. (2021). Learning layout and style reconfigurable GANs for controllable image synthesis. Transactions on Pattern Analysis and Machine Intelligence, 44(9), 5070\u20135087.","journal-title":"Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2044_CR52","doi-asserted-by":"publisher","first-page":"1650","DOI":"10.1109\/LSP.2021.3101421","volume":"28","author":"Y Sun","year":"2021","unstructured":"Sun, Y., Mai, S., & Hu, H. (2021). Learning to balance the learning rates between various modalities via adaptive tracking factor. IEEE Signal Processing Letters, 28, 1650\u20131654.","journal-title":"IEEE Signal Processing Letters"},{"issue":"12","key":"2044_CR53","doi-asserted-by":"publisher","first-page":"2903","DOI":"10.1007\/s11263-022-01673-x","volume":"130","author":"V Sushko","year":"2022","unstructured":"Sushko, V., Sch\u00f6nfeld, E., Zhang, D., Gall, J., Schiele, B., & Khoreva, A. (2022). Oasis: Only adversarial supervision for semantic image synthesis. International Journal of Computer Vision, 130(12), 2903\u20132923.","journal-title":"International Journal of Computer Vision"},{"key":"2044_CR54","doi-asserted-by":"crossref","unstructured":"Suvorov, R., Logacheva, E., Mashikhin, A., Remizova, A., Ashukha, A., Silvestrov, A., Kong, N., Goka, H., Park, K., & Lempitsky, V. (2022). Resolution-robust large mask inpainting with Fourier convolutions. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision (pp. 2149\u2013159).","DOI":"10.1109\/WACV51458.2022.00323"},{"key":"2044_CR55","doi-asserted-by":"crossref","unstructured":"Sylvain, T., Zhang, P., Bengio, Y., Hjelm, R. D., & Sharma, S. (2021). Object-centric image generation from layouts. In AAAI conference on artificial intelligence (vol. 35, pp. 2647\u20132655).","DOI":"10.1609\/aaai.v35i3.16368"},{"key":"2044_CR56","unstructured":"Tao, M., Tang, H., Wu, S., Sebe, N., Jing, X.-Y., Wu, F., & Bao, B. (2020). DF-GAN: Deep fusion generative adversarial networks for text-to-image synthesis. arXiv preprint arXiv:2008.05865"},{"key":"2044_CR57","first-page":"6306","volume":"30","author":"A Van Den Oord","year":"2017","unstructured":"Van Den Oord, A., Vinyals, O., & Kavukcuoglu, K. (2017). Neural discrete representation learning. Advances in Neural Information Processing Systems, 30, 6306\u20136315.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2044_CR58","first-page":"5998","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141, & Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems, 30, 5998\u20136008.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2044_CR59","unstructured":"Wang, T., Zhang, T., Zhang, B., Ouyang, H., Chen, D., Chen, Q., & Wen, F. (2022). Pretraining is all you need for image-to-image translation. arXiv preprint arXiv:2205.12952"},{"key":"2044_CR60","doi-asserted-by":"crossref","unstructured":"Wang, T.-C., Liu, M.-Y., Zhu, J.-Y., Tao, A., Kautz, J., & Catanzaro, B. (2018). High-resolution image synthesis and semantic manipulation with conditional GANs. In Conference on computer vision and pattern recognition (pp. 8798\u20138807).","DOI":"10.1109\/CVPR.2018.00917"},{"key":"2044_CR61","doi-asserted-by":"crossref","unstructured":"Wang, W., Tran, D., & Feiszli, M. (2020). What makes training multi-modal classification networks hard?. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 12695\u201312705).","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"2044_CR62","unstructured":"Wu, C., Liang, J., Hu, X., Gan, Z., Wang, J., Wang, L., Liu, Z., Fang, Y., & Duan, N. (2022). Nuwa-infinity: Autoregressive over autoregressive generation for infinite visual synthesis. arXiv preprint arXiv:2207.09814"},{"key":"2044_CR63","doi-asserted-by":"crossref","unstructured":"Wu, C., Liang, J., Ji, L., Yang, F., Fang, Y., Jiang, D., & Duan, N. (2022). N\u00fcwa: Visual synthesis pre-training for neural visual world creation. In European conference on computer vision (pp. 720\u2013736).","DOI":"10.1007\/978-3-031-19787-1_41"},{"key":"2044_CR64","doi-asserted-by":"crossref","unstructured":"Xie, S., & Tu, Z. (2015). Holistically-nested edge detection. In International conference on computer vision (pp. 1395\u20131403).","DOI":"10.1109\/ICCV.2015.164"},{"key":"2044_CR65","doi-asserted-by":"publisher","first-page":"1451","DOI":"10.1007\/s11263-020-01429-5","volume":"129","author":"C Yang","year":"2021","unstructured":"Yang, C., Shen, Y., & Zhou, B. (2021). Semantic hierarchy emerges in deep generative representations for scene synthesis. International Journal of Computer Vision, 129, 1451\u20131466.","journal-title":"International Journal of Computer Vision"},{"key":"2044_CR66","doi-asserted-by":"crossref","unstructured":"Yang, Z., Liu, D., Wang, C., Yang, J., & Tao, D. (2022). Modeling image composition for complex scene generation. In Conference on computer vision and pattern recognition (pp. 7764\u20137773).","DOI":"10.1109\/CVPR52688.2022.00761"},{"key":"2044_CR67","unstructured":"Ye, H., Yang, X., Takac, M., Sunderraman, R., & Ji, S. (2021). Improving text-to-image synthesis using contrastive learning. In British machine vision conference."},{"key":"2044_CR68","unstructured":"Yu, J., Xu, Y., Koh, J.Y., Luong, T., Baid, G., Wang, Z., Vasudevan, V., Ku, A., Yang, Y., Ayan, B. K., Hutchinson, B., Han, W., Parekh, Z., Li, X., Zhang, H., Baldridge, J., & Wu, Y. (2022). Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789"},{"key":"2044_CR69","doi-asserted-by":"crossref","unstructured":"Zhang, L., & Agrawala, M. (2023). Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"2044_CR70","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A. A., Shechtman, E., & Wang, O. (2018). The unreasonable effectiveness of deep features as a perceptual metric. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 586\u2013595).","DOI":"10.1109\/CVPR.2018.00068"},{"key":"2044_CR71","first-page":"27196","volume":"34","author":"Z Zhang","year":"2021","unstructured":"Zhang, Z., Ma, J., Zhou, C., Men, R., Li, Z., Ding, M., & Yang, H. (2021). UFC-BERT: Unifying multi-modal controls for conditional image synthesis. Advances in Neural Information Processing Systems, 34, 27196\u201327208.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2044_CR72","doi-asserted-by":"publisher","first-page":"2418","DOI":"10.1007\/s11263-020-01300-7","volume":"128","author":"B Zhao","year":"2020","unstructured":"Zhao, B., Yin, W., Meng, L., & Sigal, L. (2020). Layout2image: Image generation from layout. International Journal of Computer Vision, 128, 2418\u20132435.","journal-title":"International Journal of Computer Vision"},{"key":"2044_CR73","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Zhang, R., Chen, C., Li, C., Tensmeyer, C., Yu, T., Gu, J., Xu, J., & Sun, T. (2021). Lafite: Towards language-free training for text-to-image generation. In Conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01738"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02044-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02044-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02044-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,27]],"date-time":"2024-08-27T07:22:19Z","timestamp":1724743339000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02044-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,2]]},"references-count":73,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2024,9]]}},"alternative-id":["2044"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02044-4","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2024,4,2]]},"assertion":[{"value":"2 May 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 February 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 April 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}