{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T11:04:11Z","timestamp":1768993451922,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":51,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556922","type":"print"},{"value":"9789819556939","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5693-9_20","type":"book-chapter","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T21:23:06Z","timestamp":1768944186000},"page":"285-298","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Prompt-Guided Adaptation for\u00a0Efficient Fine-Tuning of\u00a0Diffusion Models in\u00a0Food Image Generation"],"prefix":"10.1007","author":[{"given":"Zitian","family":"Chen","sequence":"first","affiliation":[]},{"given":"Qingbing","family":"Sang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,21]]},"reference":[{"key":"20_CR1","doi-asserted-by":"publisher","unstructured":"Beal, J., Kim, E., Tzeng, E., Huk Park, D., Zhai, A., Kislyuk, D.: Toward transformer-based object detection. arXiv e-prints arXiv:2012.09958 (2020). https:\/\/doi.org\/10.48550\/arXiv.2012.09958","DOI":"10.48550\/arXiv.2012.09958"},{"key":"20_CR2","doi-asserted-by":"publisher","unstructured":"Ben Zaken, E., Ravfogel, S., Goldberg, Y.: BitFit: simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv e-prints arXiv:2106.10199 (2021). https:\/\/doi.org\/10.48550\/arXiv.2106.10199","DOI":"10.48550\/arXiv.2106.10199"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Bossard, L., Guillaumin, M., Gool, L.V.: Food-101 \u2013 mining discriminative components with random forests. In: Springer International Publishing (2014)","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"20_CR4","doi-asserted-by":"publisher","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. arXiv e-prints arXiv:2005.14165 (2020). https:\/\/doi.org\/10.48550\/arXiv.2005.14165","DOI":"10.48550\/arXiv.2005.14165"},{"key":"20_CR5","doi-asserted-by":"publisher","unstructured":"Cai, H., Li, M., Zhang, Z., Zhang, Q., Liu, M.Y., Han, S.: Condition-aware neural network for controlled image generation. arXiv e-prints arXiv:2404.01143 (2024). https:\/\/doi.org\/10.48550\/arXiv.2404.01143","DOI":"10.48550\/arXiv.2404.01143"},{"key":"20_CR6","doi-asserted-by":"publisher","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. arXiv e-prints arXiv:2005.12872 (2020). https:\/\/doi.org\/10.48550\/arXiv.2005.12872","DOI":"10.48550\/arXiv.2005.12872"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Chen, J., Ngo, C.W.: Deep-based ingredient recognition for cooking recipe retrieval. In: Proceedings of the 24th ACM International Conference on Multimedia (2016). https:\/\/api.semanticscholar.org\/CorpusID:207240186","DOI":"10.1145\/2964284.2964315"},{"key":"20_CR8","doi-asserted-by":"publisher","unstructured":"Chen, S., et al.: AdaptFormer: adapting vision transformers for scalable visual recognition. arXiv e-prints arXiv:2205.13535 (2022). https:\/\/doi.org\/10.48550\/arXiv.2205.13535","DOI":"10.48550\/arXiv.2205.13535"},{"key":"20_CR9","doi-asserted-by":"publisher","unstructured":"Dai, Z., Yang, Z., Yang, Y., Carbonell, J., Le, Q.V., Salakhutdinov, R.: Transformer-XL: attentive language models beyond a fixed-length context. arXiv e-prints arXiv:1901.02860 (2019). https:\/\/doi.org\/10.48550\/arXiv.1901.02860","DOI":"10.48550\/arXiv.1901.02860"},{"key":"20_CR10","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv e-prints arXiv:1810.04805 (2018). https:\/\/doi.org\/10.48550\/arXiv.1810.04805","DOI":"10.48550\/arXiv.1810.04805"},{"key":"20_CR11","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1038\/s42256-023-00626-4","volume":"5","author":"Parameter-efficient fine-tuning of large-scale pre-trained language models","year":"2023","unstructured":"Parameter-efficient fine-tuning of large-scale pre-trained language models: Ding, N., Q.Y.Y.G.e.a. Nat. Mach. Intell. 5, 220\u2013235 (2023). https:\/\/doi.org\/10.1038\/s42256-023-00626-4","journal-title":"Nat. Mach. Intell."},{"key":"20_CR12","doi-asserted-by":"publisher","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv e-prints arXiv:2010.11929 (2020). https:\/\/doi.org\/10.48550\/arXiv.2010.11929","DOI":"10.48550\/arXiv.2010.11929"},{"key":"20_CR13","doi-asserted-by":"publisher","unstructured":"Fan, A., et al.: Beyond english-centric multilingual machine translation. arXiv e-prints arXiv:2010.11125 (2020). https:\/\/doi.org\/10.48550\/arXiv.2010.11125","DOI":"10.48550\/arXiv.2010.11125"},{"key":"20_CR14","doi-asserted-by":"publisher","unstructured":"Fu, W., Han, Y., He, J., Baireddy, S., Gupta, M., Zhu, F.: Conditional synthetic food image generation. arxiv e-prints arXiv:2303.09005 (2023). https:\/\/doi.org\/10.48550\/arXiv.2303.09005","DOI":"10.48550\/arXiv.2303.09005"},{"key":"20_CR15","doi-asserted-by":"publisher","unstructured":"Gao, P., et al.: Lumina-T2X: transforming text into any modality, resolution, and duration via flow-based large diffusion transformers. arXiv e-prints arXiv:2405.05945 (2024). https:\/\/doi.org\/10.48550\/arXiv.2405.05945","DOI":"10.48550\/arXiv.2405.05945"},{"key":"20_CR16","doi-asserted-by":"publisher","unstructured":"Gao, S., Zhou, P., Cheng, M.M., Yan, S.: MDTv2: masked diffusion transformer is a strong image synthesizer. arXiv e-prints arXiv:2303.14389 (2023). https:\/\/doi.org\/10.48550\/arXiv.2303.14389","DOI":"10.48550\/arXiv.2303.14389"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Han, F., Guerrero, R., Pavlovic, V.: Cookgan: meal image synthesis from ingredients. In: 2020 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 1439\u20131447 (2020). https:\/\/api.semanticscholar.org\/CorpusID:211505950","DOI":"10.1109\/WACV45572.2020.9093463"},{"key":"20_CR18","doi-asserted-by":"publisher","unstructured":"Han, F., Hao, G., Guerrero, R., Pavlovic, V.: MPG: a multi-ingredient pizza image generator with conditional StyleGANs. arXiv e-prints arXiv:2012.02821 (2020). https:\/\/doi.org\/10.48550\/arXiv.2012.02821","DOI":"10.48550\/arXiv.2012.02821"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Han, Y., He, J., Gupta, M., Delp, E.J., Zhu, F.M.: Diffusion model with clustering-based conditioning for food image generation. In: Proceedings of the 8th International Workshop on Multimedia Assisted Dietary Management (2023). https:\/\/api.semanticscholar.org\/CorpusID:261493937","DOI":"10.1145\/3607828.3617796"},{"key":"20_CR20","doi-asserted-by":"publisher","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. arXiv e-prints arXiv:2111.06377 (2021). https:\/\/doi.org\/10.48550\/arXiv.2111.06377","DOI":"10.48550\/arXiv.2111.06377"},{"key":"20_CR21","doi-asserted-by":"publisher","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. arXiv e-prints arXiv:1706.08500 (2017). https:\/\/doi.org\/10.48550\/arXiv.1706.08500","DOI":"10.48550\/arXiv.1706.08500"},{"key":"20_CR22","doi-asserted-by":"publisher","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. arXiv e-prints arXiv:2006.11239 (2020). https:\/\/doi.org\/10.48550\/arXiv.2006.11239","DOI":"10.48550\/arXiv.2006.11239"},{"key":"20_CR23","doi-asserted-by":"publisher","unstructured":"Hoffmann, J., et al.: Training compute-optimal large language models. arXiv e-prints arXiv:2203.15556 (2022). https:\/\/doi.org\/10.48550\/arXiv.2203.15556","DOI":"10.48550\/arXiv.2203.15556"},{"key":"20_CR24","doi-asserted-by":"publisher","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for NLP. arXiv e-prints arXiv:1902.00751 (2019). https:\/\/doi.org\/10.48550\/arXiv.1902.00751","DOI":"10.48550\/arXiv.1902.00751"},{"key":"20_CR25","doi-asserted-by":"publisher","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. arXiv e-prints arXiv:2106.09685 (2021). https:\/\/doi.org\/10.48550\/arXiv.2106.09685","DOI":"10.48550\/arXiv.2106.09685"},{"key":"20_CR26","doi-asserted-by":"publisher","unstructured":"Hu, J., Yi, X., Li, W., Sun, M., Xie, X.: Fuse it more deeply! a variational transformer with layer-wise latent variable inference for text generation. arXiv e-prints arXiv:2207.06130 (2022). https:\/\/doi.org\/10.48550\/arXiv.2207.06130","DOI":"10.48550\/arXiv.2207.06130"},{"key":"20_CR27","doi-asserted-by":"publisher","unstructured":"Jia, M., et al.: Visual prompt tuning. arXiv e-prints arXiv:2203.12119 (2022). https:\/\/doi.org\/10.48550\/arXiv.2203.12119","DOI":"10.48550\/arXiv.2203.12119"},{"key":"20_CR28","doi-asserted-by":"publisher","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and improving the image quality of StyleGAN. arXiv e-prints arXiv:1912.04958 (2019). https:\/\/doi.org\/10.48550\/arXiv.1912.04958","DOI":"10.48550\/arXiv.1912.04958"},{"key":"20_CR29","doi-asserted-by":"publisher","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv e-prints arXiv:1312.6114 (2013). https:\/\/doi.org\/10.48550\/arXiv.1312.6114","DOI":"10.48550\/arXiv.1312.6114"},{"key":"20_CR30","doi-asserted-by":"publisher","unstructured":"Lee, J., Tang, R., Lin, J.: What would elsa do? freezing layers during transformer fine-tuning. arXiv e-prints arXiv:1911.03090 (2019). https:\/\/doi.org\/10.48550\/arXiv.1911.03090","DOI":"10.48550\/arXiv.1911.03090"},{"key":"20_CR31","doi-asserted-by":"publisher","unstructured":"Liu, X., Gong, C., Liu, Q.: Flow straight and fast: learning to generate and transfer data with rectified flow. arXiv e-prints arXiv:2209.03003 (2022). https:\/\/doi.org\/10.48550\/arXiv.2209.03003","DOI":"10.48550\/arXiv.2209.03003"},{"key":"20_CR32","doi-asserted-by":"publisher","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach. arXiv e-prints arXiv:1907.11692 (2019). https:\/\/doi.org\/10.48550\/arXiv.1907.11692","DOI":"10.48550\/arXiv.1907.11692"},{"key":"20_CR33","doi-asserted-by":"publisher","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. arXiv e-prints arXiv:2103.14030 (2021). https:\/\/doi.org\/10.48550\/arXiv.2103.14030","DOI":"10.48550\/arXiv.2103.14030"},{"key":"20_CR34","doi-asserted-by":"publisher","unstructured":"Ma, N., Goldstein, M., Albergo, M.S., Boffi, N.M., Vanden-Eijnden, E., Xie, S.: SiT: exploring flow and diffusion-based generative models with scalable interpolant transformers. arXiv e-prints arXiv:2401.08740 (2024). https:\/\/doi.org\/10.48550\/arXiv.2401.08740","DOI":"10.48550\/arXiv.2401.08740"},{"key":"20_CR35","unstructured":"Markham, O., Chen, Y., en\u00a0Amy\u00a0Tai, C., Wong, A.: Foodfusion: a latent diffusion model for realistic food image generation. arXiv abs\/2312.03540 (2023). https:\/\/api.semanticscholar.org\/CorpusID:265692868"},{"key":"20_CR36","doi-asserted-by":"publisher","unstructured":"Nichol, A., Dhariwal, P.: Improved denoising diffusion probabilistic models. arXiv e-prints arXiv:2102.09672 (2021). https:\/\/doi.org\/10.48550\/arXiv.2102.09672","DOI":"10.48550\/arXiv.2102.09672"},{"key":"20_CR37","doi-asserted-by":"publisher","unstructured":"Ott, M., Edunov, S., Grangier, D., Auli, M.: Scaling neural machine translation. arxiv e-prints arXiv:1806.00187 (2018). https:\/\/doi.org\/10.48550\/arXiv.1806.00187","DOI":"10.48550\/arXiv.1806.00187"},{"key":"20_CR38","doi-asserted-by":"publisher","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. arXiv e-prints arXiv:2212.09748 (2022). https:\/\/doi.org\/10.48550\/arXiv.2212.09748","DOI":"10.48550\/arXiv.2212.09748"},{"key":"20_CR39","doi-asserted-by":"publisher","unstructured":"Pfeiffer, J., Kamath, A., R\u00fcckl\u00e9, A., Cho, K., Gurevych, I.: AdapterFusion: non-destructive task composition for transfer learning. arXiv e-prints arXiv:2005.00247 (2020). https:\/\/doi.org\/10.48550\/arXiv.2005.00247","DOI":"10.48550\/arXiv.2005.00247"},{"key":"20_CR40","doi-asserted-by":"publisher","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. arXiv e-prints arXiv:1910.10683 (2019). https:\/\/doi.org\/10.48550\/arXiv.1910.10683","DOI":"10.48550\/arXiv.1910.10683"},{"key":"20_CR41","doi-asserted-by":"publisher","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. arXiv e-prints arXiv:2112.10752 (2021). https:\/\/doi.org\/10.48550\/arXiv.2112.10752","DOI":"10.48550\/arXiv.2112.10752"},{"key":"20_CR42","doi-asserted-by":"publisher","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. arXiv e-prints arXiv:1505.04597 (2015). https:\/\/doi.org\/10.48550\/arXiv.1505.04597","DOI":"10.48550\/arXiv.1505.04597"},{"key":"20_CR43","doi-asserted-by":"publisher","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. arXiv e-prints arXiv:2011.13456 (2020). https:\/\/doi.org\/10.48550\/arXiv.2011.13456","DOI":"10.48550\/arXiv.2011.13456"},{"key":"20_CR44","doi-asserted-by":"publisher","unstructured":"Su, J., Lu, Y., Pan, S., Murtadha, A., Wen, B., Liu, Y.: RoFormer: enhanced transformer with rotary position embedding. arxiv e-prints arXiv:2104.09864 (2021). https:\/\/doi.org\/10.48550\/arXiv.2104.09864","DOI":"10.48550\/arXiv.2104.09864"},{"key":"20_CR45","doi-asserted-by":"publisher","unstructured":"Tao, M., Bao, B.K., Tang, H., Xu, C.: GALIP: generative adversarial CLIPs for text-to-image synthesis. arXiv e-prints arXiv:2301.12959 (2023). https:\/\/doi.org\/10.48550\/arXiv.2301.12959","DOI":"10.48550\/arXiv.2301.12959"},{"key":"20_CR46","doi-asserted-by":"publisher","unstructured":"Vaswani, A., et al.: Attention is all you need. arXiv e-prints arXiv:1706.03762 (2017). https:\/\/doi.org\/10.48550\/arXiv.1706.03762","DOI":"10.48550\/arXiv.1706.03762"},{"key":"20_CR47","doi-asserted-by":"publisher","unstructured":"Xie, E., et al.: DiffFit: unlocking transferability of large diffusion models via simple parameter-efficient fine-tuning. arXiv e-prints arXiv:2304.06648 (2023). https:\/\/doi.org\/10.48550\/arXiv.2304.06648","DOI":"10.48550\/arXiv.2304.06648"},{"key":"20_CR48","doi-asserted-by":"publisher","unstructured":"Xu, M., Wang, J., Tao, M., Bao, B.K., Xu, C.: Cookgalip: recipe controllable generative adversarial clips with sequential ingredient prompts for food image generation. IEEE Trans. Multimedia, 1\u201311 (2024). https:\/\/doi.org\/10.1109\/TMM.2024.3377540","DOI":"10.1109\/TMM.2024.3377540"},{"key":"20_CR49","unstructured":"Zheng, H., Nie, W., Vahdat, A., Anandkumar, A.: Fast training of diffusion models with masked transformers. Trans. Mach. Learn. Res. (TMLR) (2024)"},{"key":"20_CR50","doi-asserted-by":"publisher","unstructured":"Zhu, B., Ngo, C.W., Chen, J., Hao, Y.: R\u00b2gan: cross-modal recipe retrieval with generative adversarial network. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11469\u201311478 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.01174","DOI":"10.1109\/CVPR.2019.01174"},{"key":"20_CR51","doi-asserted-by":"publisher","unstructured":"Zhu, T., Chen, J., Zhu, R., Gupta, G.: StyleGAN3: generative networks for improving the equivariance of translation and rotation. arXiv e-prints arXiv:2307.03898 (2023). https:\/\/doi.org\/10.48550\/arXiv.2307.03898","DOI":"10.48550\/arXiv.2307.03898"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5693-9_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T21:23:16Z","timestamp":1768944196000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5693-9_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556922","9789819556939"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5693-9_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"21 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}