{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T03:50:20Z","timestamp":1769745020764,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":25,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556953","type":"print"},{"value":"9789819556960","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5696-0_18","type":"book-chapter","created":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T14:03:56Z","timestamp":1769695436000},"page":"254-265","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Auto-Locate: A Training-Free Multi-instance Generation for\u00a0Text-to-Image Diffusion 
Models"],"prefix":"10.1007","author":[{"given":"Xiangzhi","family":"Tao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kuangzhi","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhongyang","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Naijie","family":"Gu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,30]]},"reference":[{"key":"18_CR1","first-page":"852","volume":"34","author":"T Karras","year":"2021","unstructured":"Karras, T., et al.: Alias-free generative adversarial networks. Adv. Neural. Inf. Process. Syst. 34, 852\u2013863 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR2","volume-title":"Auto-Encoding Variational Bayes","author":"DP Kingma","year":"2013","unstructured":"Kingma, D.P., Welling, M.: Auto-Encoding Variational Bayes. Banff, Canada (2013)"},{"key":"18_CR3","unstructured":"Chen, M., et al.: Generative pretraining from pixels. In: International Conference on Machine Learning, pp. 1691\u20131703. PMLR (2020)"},{"key":"18_CR4","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502. (2020)"},{"key":"18_CR5","doi-asserted-by":"crossref","unstructured":"Ren, J., Xu, M., Wu, J.-C., Liu, Z., Xiang, T., Toisoul, A.: Move anything with layered scene diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6380\u20136389 (2024)","DOI":"10.1109\/CVPR52733.2024.00610"},{"key":"18_CR6","doi-asserted-by":"crossref","unstructured":"Zhou, D., Li, Y., Ma, F., Zhang, X., Yang, Y.: Migc: Multi-instance generation controller for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
6818\u20136828 (2024)","DOI":"10.1109\/CVPR52733.2024.00651"},{"key":"18_CR7","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"18_CR8","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Textcraftor: Your text encoder can be image quality controller. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7985\u20137995 (2024)","DOI":"10.1109\/CVPR52733.2024.00763"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"Liu, Q., You, J., Wang, J., Tao, X., Zhang, B., Niu, L.: Shadow generation for composite image using diffusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8121\u20138130 (2024)","DOI":"10.1109\/CVPR52733.2024.00776"},{"key":"18_CR10","doi-asserted-by":"crossref","unstructured":"Shirakawa, T., Uchida, S.: Noisecollage: A layout-aware text-to-image diffusion model based on noise cropping and merging. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8921\u20138930 (2024)","DOI":"10.1109\/CVPR52733.2024.00852"},{"key":"18_CR11","doi-asserted-by":"crossref","unstructured":"Wang, X., Darrell, T., Rambhatla, S.S., Girdhar, R., Misra, I.: Instancediffusion: Instance-level control for image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6232\u20136242 (2024)","DOI":"10.1109\/CVPR52733.2024.00596"},{"key":"18_CR12","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control (2022). 
https:\/\/arxiv.org\/abs\/2208.01626"},{"key":"18_CR13","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592116","volume":"42","author":"H Chefer","year":"2023","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM Trans. Graphics (TOG). 42, 1\u201310 (2023)","journal-title":"ACM Trans. Graphics (TOG)."},{"key":"18_CR14","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Gligen: Open-set grounded text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22511\u201322521 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"18_CR15","doi-asserted-by":"crossref","unstructured":"Wang, X., Darrell, T., Rambhatla, S.S., Girdhar, R., Misra, I.: Instancediffusion: Instance-level control for image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6232\u20136242 (2024)","DOI":"10.1109\/CVPR52733.2024.00596"},{"key":"18_CR16","doi-asserted-by":"crossref","unstructured":"Xie, J., Li, Y., Huang, Y., Liu, H., Zhang, W., Zheng, Y., Shou, M.Z.: Boxdiff: Text-to-image synthesis with training-free box-constrained diffusion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7452\u20137461 (2023)","DOI":"10.1109\/ICCV51070.2023.00685"},{"key":"18_CR17","doi-asserted-by":"crossref","unstructured":"Phung, Q., Ge, S., Huang, J.-B.: Grounded text-to-image synthesis with attention refocusing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7932\u20137942 (2024)","DOI":"10.1109\/CVPR52733.2024.00758"},{"key":"18_CR18","doi-asserted-by":"crossref","unstructured":"Gong, B., Huang, S., Feng, Y., Zhang, S., Li, Y., Liu, Y.: Check locate rectify: A training-free layout calibration system for text-to-image generation. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6624\u20136634 (2024)","DOI":"10.1109\/CVPR52733.2024.00633"},{"key":"18_CR19","unstructured":"Feng, W., et al.: Training-free structured diffusion guidance for compositional text-to-image synthesis. arXiv preprint arXiv:2212.05032. (2022)"},{"key":"18_CR20","unstructured":"Li, Y., Keuper, M., Zhang, D., Khoreva, A.: Divide & bind your attention for improved generative semantic nursing. arXiv preprint arXiv:2307.10864 (2023)"},{"key":"18_CR21","doi-asserted-by":"crossref","unstructured":"Meral, T.H.S., Simsar, E., Tombari, F., Yanardag, P.: Conform: Contrast is all you need for high-fidelity text-to-image diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9005\u20139014 (2024)","DOI":"10.1109\/CVPR52733.2024.00860"},{"key":"18_CR22","doi-asserted-by":"crossref","unstructured":"Guo, X., Liu, J., Cui, M., Li, J., Yang, H., Huang, D.: Initno: Boosting text-to-image diffusion models via initial noise optimization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9380\u20139389 (2024)","DOI":"10.1109\/CVPR52733.2024.00896"},{"key":"18_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2014 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"18_CR24","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. 
In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"18_CR25","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5696-0_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T14:04:05Z","timestamp":1769695445000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5696-0_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556953","9789819556960"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5696-0_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"30 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference 
Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}