{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T13:44:56Z","timestamp":1756993496363,"version":"3.40.3"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031781711"},{"type":"electronic","value":"9783031781728"}],"license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78172-8_9","type":"book-chapter","created":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T09:48:10Z","timestamp":1733132890000},"page":"130-145","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Text2Street: Controllable Text-to-Image Generation for\u00a0Street Views"],"prefix":"10.1007","author":[{"given":"Songen","family":"Gu","sequence":"first","affiliation":[]},{"given":"Jinming","family":"Su","sequence":"additional","affiliation":[]},{"given":"Yiting","family":"Duan","sequence":"additional","affiliation":[]},{"given":"Xingyue","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Junfeng","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"9_CR1","unstructured":"Stability AI: Stable diffusion 2.1. https:\/\/huggingface.co\/stabilityai\/stable-diffusion-2-1"},{"key":"9_CR2","unstructured":"Betker, J., et al.: Improving image generation with better captions (2023)"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Caesar, H., et al.: nuScenes: a multimodal dataset for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11621\u201311631 (2020)","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"9_CR4","series-title":"Advances in Experimental Medicine and Biology","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-030-33128-3_1","volume-title":"Deep Learning in Medical Image Analysis","author":"H-P Chan","year":"2020","unstructured":"Chan, H.-P., Samala, R.K., Hadjiiski, L.M., Zhou, C.: Deep learning in medical image analysis. In: Lee, G., Fujita, H. (eds.) Deep Learning in Medical Image Analysis. AEMB, vol. 1213, pp. 3\u201321. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-33128-3_1"},{"issue":"4","key":"9_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592116","volume":"42","author":"H Chefer","year":"2023","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. ACM Trans. Graph. (TOG) 42(4), 1\u201310 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"9_CR6","unstructured":"Chen, K., Xie, E., Chen, Z., Hong, L., Li, Z., Yeung, D.Y.: Integrating geometric control into text-to-image diffusion models for high-quality detection data generation via text prompt. arXiv: 2306.04607 (2023)"},{"key":"9_CR7","unstructured":"Ding, M., et al.: CogView: mastering text-to-image generation via transformers. arXiv preprint arXiv:2105.13290 (2021)"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"9_CR9","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: Advances in Neural Information Processing Systems, vol. 27 (2014)"},{"key":"9_CR10","unstructured":"Gregor, K., Danihelka, I., Graves, A., Rezende, D., Wierstra, D.: DRAW: a recurrent neural network for image generation. In: International Conference on Machine Learning, pp. 1462\u20131471. PMLR (2015)"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"9_CR12","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"9_CR13","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"9_CR14","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"9_CR15","doi-asserted-by":"publisher","unstructured":"Jocher, G.: YOLOv5 by Ultralytics, May 2020. https:\/\/doi.org\/10.5281\/zenodo.3908559. https:\/\/github.com\/ultralytics\/yolov5","DOI":"10.5281\/zenodo.3908559"},{"key":"9_CR16","doi-asserted-by":"crossref","unstructured":"Ju, X., Zeng, A., Zhao, C., Wang, J., Zhang, L., Xu, Q.: HumanSD: a native skeleton-guided diffusion model for human image generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15988\u201315998 (2023)","DOI":"10.1109\/ICCV51070.2023.01465"},{"key":"9_CR17","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"9_CR18","unstructured":"Mansimov, E., Parisotto, E., Ba, J.L., Salakhutdinov, R.: Generating images from captions with attention. arXiv preprint arXiv:1511.02793 (2015)"},{"issue":"4","key":"9_CR19","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1109\/MITS.2019.2907630","volume":"11","author":"E Marti","year":"2019","unstructured":"Marti, E., De Miguel, M.A., Garcia, F., Perez, J.: A review of sensor technologies for perception in automated driving. IEEE Intell. Transp. Syst. Mag. 11(4), 94\u2013108 (2019)","journal-title":"IEEE Intell. Transp. Syst. Mag."},{"key":"9_CR20","unstructured":"Midjourney: Midjourney. https:\/\/www.midjourney.com"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2I-adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 4296\u20134304 (2024)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"9_CR22","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"9_CR23","unstructured":"OpenAI: GPT-4 technical report (2023)"},{"key":"9_CR24","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"9_CR25","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"9_CR26","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"9_CR27","unstructured":"Reed, S.E., Akata, Z., Mohan, S., Tenka, S., Schiele, B., Lee, H.: Learning what and where to draw. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"9_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"9_CR30","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems, vol. 35, pp. 36479\u201336494 (2022)"},{"key":"9_CR31","doi-asserted-by":"publisher","first-page":"58","DOI":"10.1016\/j.mechatronics.2016.11.005","volume":"41","author":"Q Shi","year":"2017","unstructured":"Shi, Q., Li, C., Wang, C., Luo, H., Huang, Q., Fukuda, T.: Design and implementation of an omnidirectional vision system for robot perception. Mechatronics 41, 58\u201366 (2017)","journal-title":"Mechatronics"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Shirakawa, T., Uchida, S.: NoiseCollage: a layout-aware text-to-image diffusion model based on noise cropping and merging. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8921\u20138930 (2024)","DOI":"10.1109\/CVPR52733.2024.00852"},{"key":"9_CR33","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Swerdlow, A., Xu, R., Zhou, B.: Street-view image generation from a bird\u2019s-eye view layout. arXiv preprint arXiv:2301.04634 (2023)","DOI":"10.1109\/LRA.2024.3368234"},{"key":"9_CR35","unstructured":"Wang, W., et al.: Semantic image synthesis via diffusion models. arXiv preprint arXiv:2207.00050 (2022)"},{"key":"9_CR36","unstructured":"Yang, K., Ma, E., Peng, J., Guo, Q., Lin, D., Yu, K.: BEVControl: accurately controlling street-view elements with multi-perspective consistency via BEV sketch layout. arXiv preprint arXiv:2308.01661 (2023)"},{"key":"9_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"9_CR38","unstructured":"Zhao, S., et al.: Uni-ControlNet: all-in-one control to text-to-image diffusion models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"9_CR39","unstructured":"Zhou, Y., et al.: LAFITE: towards language-free training for text-to-image generation. arXiv preprint arXiv:2111.13792 (2021)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78172-8_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T10:05:21Z","timestamp":1733133921000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78172-8_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"ISBN":["9783031781711","9783031781728"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78172-8_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"3 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}