{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T07:37:11Z","timestamp":1773128231698,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T00:00:00Z","timestamp":1770940800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T00:00:00Z","timestamp":1770940800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s00138-026-01798-w","type":"journal-article","created":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T16:04:01Z","timestamp":1770998641000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Mhdaunet: enhancing semantic consistency in diffusion models via dual-path noise alignment"],"prefix":"10.1007","volume":"37","author":[{"given":"Yijun","family":"Bei","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Lv","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sicheng","family":"Zuo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nianshu","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qinqin","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,13]]},"reference":[{"key":"1798_CR1","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1798_CR2","unstructured":"Goodfellow, I.J., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., Bengio, Y.: Generative adversarial nets. Advances in neural information processing systems 27 (2014)"},{"key":"1798_CR3","unstructured":"Nichol, A., Dhariwal, P., Ramesh, A., Shyam, P., Mishkin, P., McGrew, B., Sutskever, I., Chen, M.: Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"1798_CR4","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., Sutskever, I.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831 (2021). Pmlr"},{"key":"1798_CR5","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"1798_CR6","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E.L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1798_CR7","unstructured":"Podell, D., English, Z., Lacey, K., Blattmann, A., Dockhorn, T., M\u00fcller, J., Penna, J., Rombach, R.: Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"1798_CR8","unstructured":"Balaji, Y., Nah, S., Huang, X., Vahdat, A., Song, J., Zhang, Q., Kreis, K., Aittala, M., Aila, T., Laine, S., et al.: ediff-i: Text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324 (2022)"},{"issue":"3","key":"1798_CR9","first-page":"8","volume":"2","author":"J Betker","year":"2023","unstructured":"Betker, J., Goh, G., Jing, L., Brooks, T., Wang, J., Li, L., Ouyang, L., Zhuang, J., Lee, J., Guo, Y., et al.: Improving image generation with better captions. Computer Sci. 2(3), 8 (2023). (https:\/\/cdn.openai.com\/papers\/dall-e-3)","journal-title":"Computer Sci."},{"key":"1798_CR10","unstructured":"Chen, J., Yu, J., Ge, C., Yao, L., Xie, E., Wu, Y., Wang, Z., Kwok, J., Luo, P., Lu, H., et al.: Pixart-$$\\alpha $$: Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426 (2023)"},{"key":"1798_CR11","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"1798_CR12","unstructured":"Pernias, P., Rampas, D., Richter, M.L., Pal, C.J., Aubreville, M.: W\u00fcrstchen: An efficient architecture for large-scale text-to-image diffusion models. arXiv preprint arXiv:2306.00637 (2023)"},{"key":"1798_CR13","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"1798_CR14","doi-asserted-by":"crossref","unstructured":"Tumanyan, N., Geyer, M., Bagon, S., Dekel, T.: Plug-and-play diffusion features for text-driven image-to-image translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1921\u20131930 (2023)","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"1798_CR15","unstructured":"Pan, X., Dong, L., Huang, S., Peng, Z., Chen, W., Wei, F.: Kosmos-g: Generating images in context with multimodal large language models. arXiv preprint arXiv:2310.02992 (2023)"},{"key":"1798_CR16","unstructured":"Singh, V., Jandial, S., Chopra, A., Ramesh, S., Krishnamurthy, B., Balasubramanian, V.N.: On conditioning the input noise for controlled image generation with diffusion models. arXiv preprint arXiv:2205.03859 (2022)"},{"issue":"4","key":"1798_CR17","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592116","volume":"42","author":"H Chefer","year":"2023","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM Trans. Actions Graph. (TOG) 42(4), 1\u201310 (2023)","journal-title":"ACM Trans. Actions Graph. (TOG)"},{"key":"1798_CR18","doi-asserted-by":"crossref","unstructured":"Guo, X., Liu, J., Cui, M., Li, J., Yang, H., Huang, D.: Initno: Boosting text-to-image diffusion models via initial noise optimization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9380\u20139389 (2024)","DOI":"10.1109\/CVPR52733.2024.00896"},{"key":"1798_CR19","unstructured":"Wang, R., Huang, H., Zhu, Y., Russakovsky, O., Wu, Y.: The silent prompt: Initial noise as implicit guidance for goal-driven image generation. arXiv preprint arXiv:2412.05101 (2024)"},{"key":"1798_CR20","doi-asserted-by":"crossref","unstructured":"Chen, C., Yang, L., Yang, X., Chen, L., He, G., Wang, C., Li, Y.: Find: Fine-tuning initial noise distribution with policy optimization for diffusion models. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 6735\u20136744 (2024)","DOI":"10.1145\/3664647.3681047"},{"key":"1798_CR21","unstructured":"Ahn, D., Kang, J., Lee, S., Min, J., Kim, M., Jang, W., Cho, H., Paul, S., Kim, S., Cha, E., et al.: A noise is worth diffusion guidance. arXiv preprint arXiv:2412.03895 (2024)"},{"key":"1798_CR22","doi-asserted-by":"crossref","unstructured":"Chen, S.X., Vaxman, Y., Ben\u00a0Baruch, E., Asulin, D., Moreshet, A., Lien, K.-C., Sra, M., Sen, P.: Tino-edit: Timestep and noise optimization for robust diffusion-based image editing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6337\u20136346 (2024)","DOI":"10.1109\/CVPR52733.2024.00606"},{"key":"1798_CR23","unstructured":"Li, S., Le, H., Xu, J., Salzmann, M.: Enhancing compositional text-to-image generation with reliable random seeds. arXiv preprint arXiv:2411.18810 (2024)"},{"key":"1798_CR24","unstructured":"Zhou, Z., Shao, S., Bai, L., Xu, Z., Han, B., Xie, Z.: Golden noise for diffusion models: A learning framework. arXiv preprint arXiv:2411.09502 (2024)"},{"key":"1798_CR25","doi-asserted-by":"crossref","unstructured":"Tian, Y., Xia, F., Song, Y.: Diffusion networks with task-specific noise control for radiology report generation. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 1771\u20131780 (2024)","DOI":"10.1145\/3664647.3681476"},{"key":"1798_CR26","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"1798_CR27","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"1798_CR28","doi-asserted-by":"crossref","unstructured":"Bao, F., Nie, S., Xue, K., Cao, Y., Li, C., Su, H., Zhu, J.: All are worth words: A vit backbone for diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22669\u201322679 (2023)","DOI":"10.1109\/CVPR52729.2023.02171"},{"key":"1798_CR29","doi-asserted-by":"crossref","unstructured":"Mou, C., Wang, X., Xie, L., Wu, Y., Zhang, J., Qi, Z., Shan, Y.: T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 4296\u20134304 (2024)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"1798_CR30","doi-asserted-by":"crossref","unstructured":"Zhong, G., Yuan, J., Wang, P., Yang, K., Guan, W., Li, Z.: Contrast-augmented diffusion model with fine-grained sequence alignment for markup-to-image generation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 5311\u20135320 (2023)","DOI":"10.1145\/3581783.3613781"},{"key":"1798_CR31","doi-asserted-by":"crossref","unstructured":"Shirakawa, T., Uchida, S.: Noisecollage: A layout-aware text-to-image diffusion model based on noise cropping and merging. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8921\u20138930 (2024)","DOI":"10.1109\/CVPR52733.2024.00852"},{"key":"1798_CR32","doi-asserted-by":"crossref","unstructured":"Lugmayr, A., Danelljan, M., Romero, A., Yu, F., Timofte, R., Van\u00a0Gool, L.: Repaint: Inpainting using denoising diffusion probabilistic models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11461\u201311471 (2022)","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"1798_CR33","unstructured":"Qi, Z., Bai, L., Xiong, H., Xie, Z.: Not all noises are created equally: Diffusion noise selection and optimization. arXiv preprint arXiv:2403.14887 (2024)"},{"key":"1798_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zhou, Y., Peng, D., Lim, J.-H., Tu, Z., Soh, D.W., Foo, L.G.: Visual prompting for one-shot controllable video editing without inversion. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp. 7784\u20137794 (2025)","DOI":"10.1109\/CVPR52734.2025.00729"},{"key":"1798_CR35","unstructured":"Ye, T., Dong, L., Xia, Y., Sun, Y., Zhu, Y., Huang, G., Wei, F.: Differential transformer. arXiv preprint arXiv:2410.05258 (2024)"},{"key":"1798_CR36","doi-asserted-by":"publisher","first-page":"36652","DOI":"10.52202\/075280-1594","volume":"36","author":"Y Kirstain","year":"2023","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-pic: An open dataset of user preferences for text-to-image generation. Adv. Neural. Inf. Process. Syst. 36, 36652\u201336663 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1798_CR37","unstructured":"Li, Z., Zhang, J., Lin, Q., Xiong, J., Long, Y., Deng, X., Zhang, Y., Liu, X., Huang, M., Xiao, Z., et al.: Hunyuan-dit: A powerful multi-resolution diffusion transformer with fine-grained chinese understanding. arXiv preprint arXiv:2405.08748 (2024)"},{"key":"1798_CR38","unstructured":"Wu, X., Hao, Y., Sun, K., Chen, Y., Zhu, F., Zhao, R., Li, H.: Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)"},{"key":"1798_CR39","doi-asserted-by":"publisher","first-page":"52132","DOI":"10.52202\/075280-2270","volume":"36","author":"D Ghosh","year":"2023","unstructured":"Ghosh, D., Hajishirzi, H., Schmidt, L.: Geneval: An object-focused framework for evaluating text-to-image alignment. Adv. Neural. Inf. Process. Syst. 36, 52132\u201352152 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1798_CR40","unstructured":"Yu, J., Xu, Y., Koh, J.Y., Luong, T., Baid, G., Wang, Z., Vasudevan, V., Ku, A., Yang, Y., Ayan, B.K., et al. Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789 2(3), 5 (2022)"},{"key":"1798_CR41","unstructured":"Lian, L., Li, B., Yala, A., Darrell, T.: Llm-grounded diffusion: Enhancing prompt understanding of text-to-image diffusion models with large language models. arXiv preprint arXiv:2305.13655 (2023)"},{"key":"1798_CR42","doi-asserted-by":"crossref","unstructured":"Wang, Z.J., Montoya, E., Munechika, D., Yang, H., Hoover, B., Chau, D.H.: Diffusiondb: A large-scale prompt gallery dataset for text-to-image generative models. arXiv preprint arXiv:2210.14896 (2022)","DOI":"10.18653\/v1\/2023.acl-long.51"},{"key":"1798_CR43","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"1798_CR44","first-page":"15903","volume":"36","author":"J Xu","year":"2023","unstructured":"Xu, J., Liu, X., Wu, Y., Tong, Y., Li, Q., Ding, M., Tang, J., Dong, Y.: Imagereward: Learning and evaluating human preferences for text-to-image generation. Adv. Neural. Inf. Process. Syst. 36, 15903\u201315935 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1798_CR45","unstructured":"Barratt, S., Sharma, R.: A note on the inception score. arXiv preprint arXiv:1801.01973 (2018)"},{"key":"1798_CR46","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)"},{"key":"1798_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, S., Wang, B., Wu, J., Li, Y., Gao, T., Zhang, D., Wang, Z.: Learning multi-dimensional human preference for text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8018\u20138027 (2024)","DOI":"10.1109\/CVPR52733.2024.00766"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-026-01798-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00138-026-01798-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-026-01798-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T18:12:51Z","timestamp":1773079971000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00138-026-01798-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,13]]},"references-count":47,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["1798"],"URL":"https:\/\/doi.org\/10.1007\/s00138-026-01798-w","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"value":"0932-8092","type":"print"},{"value":"1432-1769","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,13]]},"assertion":[{"value":"30 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 August 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 January 2026","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 February 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"37"}}