{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T07:40:42Z","timestamp":1743147642101,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":34,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819620630"},{"type":"electronic","value":"9789819620647"}],"license":[{"start":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T00:00:00Z","timestamp":1735344000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T00:00:00Z","timestamp":1735344000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2064-7_28","type":"book-chapter","created":{"date-parts":[[2024,12,27]],"date-time":"2024-12-27T19:25:11Z","timestamp":1735327511000},"page":"381-394","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Saliency Guided Optimization of\u00a0Diffusion Latents"],"prefix":"10.1007","author":[{"given":"Xiwen","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jizhe","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuekang","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mao","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"28_CR1","unstructured":"Anlatan: NovelAI Improvements on Stable Diffusion \u2014 blog.novelai.net. https:\/\/blog.novelai.net\/novelai-improvements-on-stable-diffusion-e10d38db82ac. Accessed 16 May 2024"},{"issue":"5","key":"28_CR2","doi-asserted-by":"publisher","first-page":"2392","DOI":"10.1109\/TIP.2016.2545863","volume":"25","author":"SH Bae","year":"2016","unstructured":"Bae, S.H., Kim, M.: A novel image quality assessment with globally and locally consilient visual quality perception. IEEE Trans. Image Process. 25(5), 2392\u20132406 (2016)","journal-title":"IEEE Trans. Image Process."},{"key":"28_CR3","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"28_CR4","unstructured":"Ha, D., Dai, A., Le, Q.V.: Hypernetworks. arXiv preprint arXiv:1609.09106 (2016)"},{"key":"28_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"186","DOI":"10.1007\/978-3-030-03341-5_16","volume-title":"Pattern Recognition and Computer Vision","author":"L Han","year":"2018","unstructured":"Han, L., Li, X., Dong, Y.: SalNet: edge constraint based end-to-end model for salient object detection. In: Lai, J.-H., et al. (eds.) PRCV 2018, Part IV. LNCS, vol. 11259, pp. 186\u2013198. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-03341-5_16"},{"key":"28_CR6","doi-asserted-by":"crossref","unstructured":"Han, L., Li, Y., Zhang, H., Milanfar, P., Metaxas, D., Yang, F.: SVDiff: compact parameter space for diffusion fine-tuning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7323\u20137334 (2023)","DOI":"10.1109\/ICCV51070.2023.00673"},{"key":"28_CR7","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"28_CR8","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"28_CR9","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"28_CR10","doi-asserted-by":"crossref","unstructured":"Hou, X., Zhang, L.: Saliency detection: a spectral residual approach. In: 2007 IEEE Conference on Computer Vision and Pattern Recognition, pp.\u00a01\u20138. IEEE (2007)","DOI":"10.1109\/CVPR.2007.383267"},{"key":"28_CR11","unstructured":"Hu, E.J., et al.: LoRa: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"28_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2020.103887","volume":"95","author":"S Jia","year":"2020","unstructured":"Jia, S., Bruce, N.D.: EML-Net: an expandable multi-layer network for saliency prediction. Image Vis. Comput. 95, 103887 (2020)","journal-title":"Image Vis. Comput."},{"issue":"11","key":"28_CR13","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"key":"28_CR14","unstructured":"Lee, T., et\u00a0al.: Holistic evaluation of text-to-image models. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"28_CR15","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1016\/j.neucom.2022.04.080","volume":"494","author":"J Lou","year":"2022","unstructured":"Lou, J., Lin, H., Marshall, D., Saupe, D., Liu, H.: TranSalNet: towards perceptually relevant visual saliency prediction. Neurocomputing 494, 455\u2013467 (2022)","journal-title":"Neurocomputing"},{"key":"28_CR16","doi-asserted-by":"crossref","unstructured":"Moorthy, A.K., Wang, Z., Bovik, A.C.: Visual perception and quality assessment. In: Optical and Digital Image Processing: Fundamentals and Applications, pp. 419\u2013439 (2011)","DOI":"10.1002\/9783527635245.ch19"},{"key":"28_CR17","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"28_CR18","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning, pp. 8162\u20138171. PMLR (2021)"},{"key":"28_CR19","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"28_CR20","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"28_CR21","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"28_CR22","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"28_CR23","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Salimans, T., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"2","key":"28_CR24","doi-asserted-by":"publisher","first-page":"430","DOI":"10.1109\/TIP.2005.859378","volume":"15","author":"HR Sheikh","year":"2006","unstructured":"Sheikh, H.R., Bovik, A.C.: Image information and visual quality. IEEE Trans. Image Process. 15(2), 430\u2013444 (2006)","journal-title":"IEEE Trans. Image Process."},{"key":"28_CR25","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"28_CR26","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"28_CR27","unstructured":"Song, Y., Ermon, S.: Generative modeling by estimating gradients of the data distribution. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"28_CR28","doi-asserted-by":"crossref","unstructured":"Wallace, B., Gokul, A., Ermon, S., Naik, N.: End-to-end diffusion latent optimization improves classifier guidance. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7280\u20137290 (2023)","DOI":"10.1109\/ICCV51070.2023.00669"},{"key":"28_CR29","doi-asserted-by":"crossref","unstructured":"Wallace, B., Gokul, A., Naik, N.: EDICT: exact diffusion inversion via coupled transformations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22532\u201322541 (2023)","DOI":"10.1109\/CVPR52729.2023.02158"},{"key":"28_CR30","unstructured":"Wu, X., et al.: Human preference score V2: a solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)"},{"key":"28_CR31","doi-asserted-by":"crossref","unstructured":"Xie, E., et al.: DiffFit: unlocking transferability of large diffusion models via simple parameter-efficient fine-tuning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4230\u20134239 (2023)","DOI":"10.1109\/ICCV51070.2023.00390"},{"key":"28_CR32","unstructured":"Zhang, C., Zhang, C., Zhang, M., Kweon, I.S.: Text-to-image diffusion model in generative AI: a survey. arXiv preprint arXiv:2303.07909 (2023)"},{"issue":"7","key":"28_CR33","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1167\/8.7.32","volume":"8","author":"L Zhang","year":"2008","unstructured":"Zhang, L., Tong, M.H., Marks, T.K., Shan, H., Cottrell, G.W.: SUN: a Bayesian framework for saliency using natural statistics. J. Vis. 8(7), 32\u201332 (2008)","journal-title":"J. Vis."},{"key":"28_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2064-7_28","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,27]],"date-time":"2024-12-27T20:05:15Z","timestamp":1735329915000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2064-7_28"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,28]]},"ISBN":["9789819620630","9789819620647"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2064-7_28","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,28]]},"assertion":[{"value":"28 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}