{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T12:01:16Z","timestamp":1767873676058,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":36,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557011","type":"print"},{"value":"9789819557028","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5702-8_19","type":"book-chapter","created":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T08:29:47Z","timestamp":1767860987000},"page":"270-283","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["EIU-IC: Enhancing Interaction Understanding in\u00a0Text-to-Image Generation Models with\u00a0Interaction Control"],"prefix":"10.1007","author":[{"given":"Chengyang","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yonghua","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenjing","family":"Gao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,9]]},"reference":[{"key":"19_CR1","unstructured":"Hua, M., Liu, J., Ding, F., Liu, W., Wu, J., He, Q.: DreamTuner: single image is enough for subject-driven generation. arXiv preprint arXiv:2312.13691 (2023)"},{"issue":"2","key":"19_CR2","first-page":"909","volume":"38","author":"Y Cai","year":"2024","unstructured":"Cai, Y., Wei, Y., Ji, Z., Bai, J., Han, H., Zuo, W.: Decoupled textual embeddings for customized image generation. Proc. AAAI Conf. Artif. Intell. 38(2), 909\u2013917 (2024)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"19_CR3","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Zhang, R., Gu, J., Sun, T.: Customization assistant for text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9182\u20139191 (2024)","DOI":"10.1109\/CVPR52733.2024.00877"},{"key":"19_CR4","unstructured":"Pan, J., Yan, H., Liew, J.H., Feng, J., Tan, V.Y.F.: Towards accurate guided diffusion sampling through symplectic adjoint method. arXiv preprint arXiv:2312.12030 (2023)"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Chen, D.-Y., Tennent, H., Hsu, C.-W.: ArtAdapter: text-to-image style transfer using multi-level style encoder and explicit adaptation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8619\u20138628 (2024)","DOI":"10.1109\/CVPR52733.2024.00823"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Giambi, N., Lisanti, G.: Conditioning diffusion models via attributes and semantic masks for face generation. arXiv preprint arXiv:2306.00914 (2023)","DOI":"10.1016\/j.cviu.2024.104026"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Valevski, D., Lumen, D., Matias, Y., Leviathan, Y.: Face0: instantaneously conditioning a text-to-image model on a face. In: SIGGRAPH Asia 2023 Conference Papers, pp. 1\u201310 (2023)","DOI":"10.1145\/3610548.3618249"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Hoe, J.T., Jiang, X., Chan, C.S., Tan, Y.-P., Hu, W.: InteractDiffusion: interaction control in text-to-image diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6180\u20136189 (2024)","DOI":"10.1109\/CVPR52733.2024.00591"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Huang, Z., Wu, T., Jiang, Y., Chan, K.C.K, Liu, Z.: ReVersion: diffusion-based relation inversion from images. In: SIGGRAPH Asia 2024 Conference Papers, pp. 1\u201311 (2024)","DOI":"10.1145\/3680528.3687658"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Huang, S., et al.: Learning disentangled identifiers for action-customized text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7797\u20137806 (2024)","DOI":"10.1109\/CVPR52733.2024.00745"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Cao, P., Zhou, F., Song, Q., Yang, L.: Controllable generation with text-to-image diffusion models: a survey. arXiv preprint arXiv:2403.04279 (2024)","DOI":"10.1109\/TPAMI.2025.3646548"},{"issue":"2","key":"19_CR12","first-page":"1584","volume":"35","author":"T Hua","year":"2021","unstructured":"Hua, T., Zheng, H., Bai, Y., Zhang, W., Zhang, X.-P., Mei, T.: Exploiting relationship for complex-scene image generation. Proc. AAAI Conf. Artif. Intell. 35(2), 1584\u20131592 (2021)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Gao, C., et al.: InteractGAN: learning to generate human-object interaction. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 165\u2013173 (2020)","DOI":"10.1145\/3394171.3413854"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"19_CR15","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"19_CR16","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"19_CR17","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR18","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR19","unstructured":"Zheng, G., et al.: Entropy-driven sampling and training scheme for conditional diffusion generation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13682, pp. 754\u2013769. Springer, Cham (2022)"},{"key":"19_CR20","unstructured":"Ren, J., Xu, C., Chen, H., Qin, X., Li, C., Zhu, L.: Towards flexible, scalable, and adaptive multi-modal conditioned face synthesis. arXiv preprint arXiv:2312.16274 (2023)"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Zheng, G., Zhou, X., Li, X., Qi, Z., Shan, Y., Li, X.: LayoutDiffusion: controllable diffusion model for layout-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22490\u201322499 (2023)","DOI":"10.1109\/CVPR52729.2023.02154"},{"key":"19_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"19_CR23","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GliGEN: open-set grounded text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22511\u201322521 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"19_CR25","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Chao, Y.-W., Liu, Y., Liu, X., Zeng, H., Deng, J.: Learning to detect human-object interactions. In: 2018 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 381\u2013389. IEEE (2018)","DOI":"10.1109\/WACV.2018.00048"},{"issue":"4","key":"19_CR27","doi-asserted-by":"publisher","first-page":"2415","DOI":"10.1109\/TPAMI.2023.3331738","volume":"46","author":"S Ma","year":"2023","unstructured":"Ma, S., Wang, Y., Wang, S., Wei, Y.: FGAHOI: fine-grained anchors for human-object interaction detection. IEEE Trans. Pattern Anal. Mach. Intell. 46(4), 2415\u20132429 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR28","unstructured":"Bi\u0144kowski, M., Sutherland, D.J., Arbel, M., Gretton, A.: Demystifying MMD GANs. arXiv preprint arXiv:1801.01401 (2018)"},{"key":"19_CR29","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"19_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"852","DOI":"10.1007\/978-3-319-46448-0_51","volume-title":"Computer Vision \u2013 ECCV 2016","author":"C Lu","year":"2016","unstructured":"Lu, C., Krishna, R., Bernstein, M., Fei-Fei, L.: Visual relationship detection with language priors. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 852\u2013869. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_51"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Yu, R., Li, A., Morariu, V.I., Davis, L.S.: Visual relationship detection with internal and external linguistic knowledge distillation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1974\u20131982 (2017)","DOI":"10.1109\/ICCV.2017.121"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Zhuang, B., Liu, L., Shen, C., Reid, I.: Towards context-aware interaction recognition for visual relationship detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 589\u2013598 (2017)","DOI":"10.1109\/ICCV.2017.71"},{"issue":"6","key":"19_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3618322","volume":"42","author":"Y Alaluf","year":"2023","unstructured":"Alaluf, Y., Richardson, E., Metzer, G., Cohen-Or, D.: A neural space-time representation for text-to-image personalization. ACM TOG 42(6), 1\u201310 (2023)","journal-title":"ACM TOG"},{"key":"19_CR34","unstructured":"Jia, X., et al.: Taming encoder for zero fine-tuning image customization with text-to-image diffusion models. arXiv preprint arXiv:2304.02642 (2023)"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Arar, M., et al.: Domain-agnostic tuning-encoder for fast personalization of text-to-image models. In: SIGGRAPH Asia 2023 Conference Papers, pp. 1\u201310 (2023)","DOI":"10.1145\/3610548.3618173"},{"issue":"1","key":"19_CR36","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5702-8_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T08:29:56Z","timestamp":1767860996000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5702-8_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557011","9789819557028"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5702-8_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"9 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}