{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T20:12:28Z","timestamp":1774383148234,"version":"3.50.1"},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFA1000102"],"award-info":[{"award-number":["2021YFA1000102"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007129","name":"Natural Science Foundation of Shandong Province","doi-asserted-by":"publisher","award":["ZR2022MF260"],"award-info":[{"award-number":["ZR2022MF260"]}],"id":[{"id":"10.13039\/501100007129","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61673396"],"award-info":[{"award-number":["61673396"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376285"],"award-info":[{"award-number":["62376285"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1016\/j.neunet.2026.108823","type":"journal-article","created":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T23:27:26Z","timestamp":1772926046000},"page":"108823","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["FreeMD: Training-free multi-domain text-to-image generation with any control"],"prefix":"10.1016","volume":"200","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7323-5896","authenticated-orcid":false,"given":"Mingwen","family":"Shao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9192-9872","authenticated-orcid":false,"given":"Chang","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0331-1608","authenticated-orcid":false,"given":"Xiang","family":"Lv","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6549-6989","authenticated-orcid":false,"given":"Lingzhuang","family":"Meng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4217-3861","authenticated-orcid":false,"given":"Yecong","family":"Wan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6577-026X","authenticated-orcid":false,"given":"Zhengyi","family":"Gong","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.108823_bib0001","series-title":"ACM siggraph 2024 conference papers","first-page":"1","article-title":"LooseControl: Lifting controlnet for generalized depth conditioning","author":"Bhat","year":"2024"},{"issue":"6","key":"10.1016\/j.neunet.2026.108823_bib0002","doi-asserted-by":"crossref","first-page":"679","DOI":"10.1109\/TPAMI.1986.4767851","article-title":"A computational approach to edge detection","volume":"8","author":"Canny","year":"1986","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.108823_bib0003","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"22560","article-title":"MasaCtrl: Tuning-free mutual self-attention control for consistent image synthesis and editing","author":"Cao","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0004","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"7291","article-title":"Realtime multi-person 2D pose estimation using part affinity fields","author":"Cao","year":"2017"},{"key":"10.1016\/j.neunet.2026.108823_bib0005","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"9650","article-title":"Emerging properties in self-supervised vision transformers","author":"Caron","year":"2021"},{"key":"10.1016\/j.neunet.2026.108823_bib0006","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"5343","article-title":"Training-free layout control with cross-attention guidance","author":"Chen","year":"2024"},{"key":"10.1016\/j.neunet.2026.108823_bib0007","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"2174","article-title":"Zero-shot spatial layout conditioning for text-to-image diffusion models","author":"Couairon","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0008","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"6624","article-title":"Check locate rectify: A training-free layout calibration system for text-to-image generation","author":"Gong","year":"2024"},{"key":"10.1016\/j.neunet.2026.108823_bib0009","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., & Cohen-Or, D. (2022). Prompt-to-prompt image editing with cross attention control. arXiv: 2208.01626."},{"key":"10.1016\/j.neunet.2026.108823_bib0010","first-page":"6626","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","volume":"30","author":"Heusel","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108823_bib0011","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108823_bib0012","series-title":"Thirty-seventh conference on neural information processing systems","article-title":"Cocktail: Mixing multi-modality control for text-conditional image generation","author":"Hu","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0013","unstructured":"Huang, L., Chen, D., Liu, Y., Shen, Y., Zhao, D., & Zhou, J. (2023). Composer: Creative and controllable image synthesis with composable conditions. arXiv: 2302.09778."},{"key":"10.1016\/j.neunet.2026.108823_bib0014","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"2480","article-title":"SSMG: Spatial-semantic map guided diffusion model for free-form layout-to-image generation","volume":"38","author":"Jia","year":"2024"},{"key":"10.1016\/j.neunet.2026.108823_bib0015","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"2426","article-title":"DiffusionCLIP: Text-guided diffusion models for robust image manipulation","author":"Kim","year":"2022"},{"key":"10.1016\/j.neunet.2026.108823_bib0016","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"19721","article-title":"FlowEdit: Inversion-free text-based editing using pre-trained flow models","author":"Kulikov","year":"2025"},{"issue":"10","key":"10.1016\/j.neunet.2026.108823_bib0017","doi-asserted-by":"crossref","first-page":"12581","DOI":"10.1109\/TPAMI.2023.3282631","article-title":"UniFormer: Unifying convolution and self-attention for visual recognition","volume":"45","author":"Li","year":"2023","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.108823_bib0018","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"22511","article-title":"GLIGEN: Open-set grounded text-to-image generation","author":"Li","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0019","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"2294","article-title":"TF-ICON: Diffusion-based training-free cross-domain image composition","author":"Lu","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0020","unstructured":"Meng, C., He, Y., Song, Y., Song, J., Wu, J., Zhu, J.-Y., & Ermon, S. (2021). SDEdit: Guided image synthesis and editing with stochastic differential equations. arXiv: 2108.01073."},{"key":"10.1016\/j.neunet.2026.108823_bib0021","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"7465","article-title":"FreeControl: Training-free spatial control of any text-to-image diffusion model with any condition","author":"Mo","year":"2024"},{"key":"10.1016\/j.neunet.2026.108823_bib0022","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"4296","article-title":"T2I-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models","volume":"vol. 38","author":"Mou","year":"2024"},{"key":"10.1016\/j.neunet.2026.108823_bib0023","unstructured":"Nichol, A., Dhariwal, P., Ramesh, A., Shyam, P., Mishkin, P., McGrew, B., Sutskever, I., & Chen, M. (2021). GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv: 2112.10741."},{"key":"10.1016\/j.neunet.2026.108823_bib0024","unstructured":"Nie, W., Liu, S., Mardani, M., Liu, C., Eckart, B., & Vahdat, A. (2024). Compositional text-to-image generation with dense blob representations. arXiv: 2405.08246."},{"key":"10.1016\/j.neunet.2026.108823_bib0025","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"8764","article-title":"Zero-painter: Training-free layout control for text-to-image synthesis","author":"Ohanyan","year":"2024"},{"key":"10.1016\/j.neunet.2026.108823_bib0026","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"7932","article-title":"Grounded text-to-image synthesis with attention refocusing","author":"Phung","year":"2024"},{"key":"10.1016\/j.neunet.2026.108823_bib0027","unstructured":"Podell, D., English, Z., Lacey, K., Blattmann, A., Dockhorn, T., M\u00fcller, J., Penna, J., & Rombach, R. (2023). SDXL: Improving latent diffusion models for high-resolution image synthesis. arXiv: 2307.01952."},{"key":"10.1016\/j.neunet.2026.108823_bib0028","unstructured":"Qin, C., Zhang, S., Yu, N., Feng, Y., Yang, X., Zhou, Y., Wang, H., Niebles, J. C., Xiong, C., Savarese, S. et al. (2023). UniControl: A unified diffusion model for controllable visual generation in the wild. arXiv: 2305.11147."},{"key":"10.1016\/j.neunet.2026.108823_bib0029","series-title":"International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neunet.2026.108823_bib0030","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., & Chen, M. (2022). Hierarchical text-conditional image generation with clip latents, 1(2), 3. arXiv: 2204.06125."},{"issue":"3","key":"10.1016\/j.neunet.2026.108823_bib0031","doi-asserted-by":"crossref","first-page":"1623","DOI":"10.1109\/TPAMI.2020.3019967","article-title":"Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer","volume":"44","author":"Ranftl","year":"2020","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.108823_bib0032","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"10684","article-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach","year":"2022"},{"key":"10.1016\/j.neunet.2026.108823_bib0033","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"22500","article-title":"DreamBooth: Fine tuning text-to-image diffusion models for subject-driven generation","author":"Ruiz","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0034","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108823_bib0035","unstructured":"Song, J., Meng, C., & Ermon, S. (2020a). Denoising diffusion implicit models. arXiv: 2010.02502."},{"key":"10.1016\/j.neunet.2026.108823_bib0036","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D. P., Kumar, A., Ermon, S., & Poole, B. (2020b). Score-based generative modeling through stochastic differential equations. arXiv: 2011.13456."},{"key":"10.1016\/j.neunet.2026.108823_bib0037","series-title":"European conference on computer vision","first-page":"92","article-title":"AnyControl: Create your artwork with versatile control on text-to-image generation","author":"Sun","year":"2025"},{"key":"10.1016\/j.neunet.2026.108823_bib0038","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"10748","article-title":"Splicing vit features for semantic appearance transfer","author":"Tumanyan","year":"2022"},{"key":"10.1016\/j.neunet.2026.108823_bib0039","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"1921","article-title":"Plug-and-play diffusion features for text-driven image-to-image translation","author":"Tumanyan","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0040","series-title":"Proceedings of the IEEE international conference on computer vision","first-page":"1395","article-title":"Holistically-nested edge detection","author":"Xie","year":"2015"},{"key":"10.1016\/j.neunet.2026.108823_bib0041","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"3836","article-title":"Adding conditional control to text-to-image diffusion models","author":"Zhang","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0042","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"6027","article-title":"SINE: Single image editing with text-to-image diffusion models","author":"Zhang","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0043","series-title":"Thirty-seventh conference on neural information processing systems","article-title":"Uni-Controlnet: All-in-one control to text-to-image diffusion models","author":"Zhao","year":"2023"},{"key":"10.1016\/j.neunet.2026.108823_bib0044","doi-asserted-by":"crossref","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","article-title":"Semantic understanding of scenes through the ade20k dataset","volume":"127","author":"Zhou","year":"2019","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.neunet.2026.108823_bib0045","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"6818","article-title":"MIGC: Multi-instance generation controller for text-to-image synthesis","author":"Zhou","year":"2024"},{"issue":"10","key":"10.1016\/j.neunet.2026.108823_bib0046","doi-asserted-by":"crossref","first-page":"3096","DOI":"10.1109\/TNNLS.2018.2890018","article-title":"A deep collaborative framework for face photo-sketch synthesis","volume":"30","author":"Zhu","year":"2019","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"issue":"6","key":"10.1016\/j.neunet.2026.108823_bib0047","doi-asserted-by":"crossref","first-page":"1820","DOI":"10.1007\/s11263-021-01442-2","article-title":"Learning deep patch representation for probabilistic graphical model-based face sketch synthesis","volume":"129","author":"Zhu","year":"2021","journal-title":"International Journal of Computer Vision"},{"issue":"2","key":"10.1016\/j.neunet.2026.108823_bib0048","doi-asserted-by":"crossref","first-page":"893","DOI":"10.1109\/TNNLS.2020.3030536","article-title":"Knowledge distillation for face photo-sketch synthesis","volume":"33","author":"Zhu","year":"2022","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026002856?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026002856?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T16:38:35Z","timestamp":1774370315000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026002856"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,8]]},"references-count":48,"alternative-id":["S0893608026002856"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108823","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"FreeMD: Training-free multi-domain text-to-image generation with any control","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108823","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"108823"}}