{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T17:26:07Z","timestamp":1775323567182,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":43,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819620531","type":"print"},{"value":"9789819620548","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2054-8_30","type":"book-chapter","created":{"date-parts":[[2025,1,2]],"date-time":"2025-01-02T15:48:46Z","timestamp":1735832926000},"page":"401-414","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["FoodMLLM-JP: Leveraging Multimodal Large Language Models for\u00a0Japanese Recipe Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7470-8305","authenticated-orcid":false,"given":"Yuki","family":"Imajuku","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2752-6179","authenticated-orcid":false,"given":"Yoko","family":"Yamakata","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2146-6275","authenticated-orcid":false,"given":"Kiyoharu","family":"Aizawa","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,3]]},"reference":[{"key":"30_CR1","unstructured":"01. AI: Yi: Open foundation models by 01.AI. arXiv:2403.04652 (2024)"},{"key":"30_CR2","unstructured":"Abdin, M., et\u00a0al.: Phi-3 technical report: a highly capable language model locally on your phone. arXiv:2404.14219 (2024)"},{"key":"30_CR3","unstructured":"Anthropic: The claude 3 model family: Opus, sonnet, haiku (2024). https:\/\/www-cdn.anthropic.com\/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627\/Model_Card_Claude_3.pdf. Accessed 16 Oct 2024"},{"key":"30_CR4","unstructured":"Bai, J., et\u00a0al.: Qwen technical report. arXiv:2309.16609 (2023)"},{"key":"30_CR5","unstructured":"Bai, J., et\u00a0al.: Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond. arXiv:2308.12966 (2023)"},{"key":"30_CR6","unstructured":"Beyer, L., et\u00a0al.: PaliGemma: a versatile 3B VLM for transfer. arXiv:2407.07726 (2024)"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Chen, L., et\u00a0al.: ShareGPT4V: improving large multi-modal models with better captions. arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"30_CR8","doi-asserted-by":"crossref","unstructured":"Chhikara, P., Chaurasia, D., Jiang, Y., Masur, O., Ilievski, F.: Fire: food image to recipe generation. In: WACV (2024)","DOI":"10.1109\/WACV57701.2024.00800"},{"key":"30_CR9","doi-asserted-by":"publisher","unstructured":"Cookpad Inc.: Cookpad data. Informatics Research Data Repository, National Institute of Informatics (dataset) (2015). https:\/\/doi.org\/10.32130\/idr.5.1","DOI":"10.32130\/idr.5.1"},{"key":"30_CR10","unstructured":"Dubois, Y., et\u00a0al.: AlpacaFarm: a simulation framework for methods that learn from human feedback. In: NeurIPS (2023)"},{"key":"30_CR11","unstructured":"Gemini Team, Google: Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context (2024). https:\/\/storage.googleapis.com\/deepmind-media\/gemini\/gemini_v1_5_report.pdf. Accessed 16 Oct 2024"},{"key":"30_CR12","unstructured":"Gemma Team: Gemma: Open models based on Gemini research and technology. arXiv:2403.08295 (2024)"},{"key":"30_CR13","unstructured":"Hu, E.J., et\u00a0al.: LoRA: low-rank adaptation of large language models. In: ICLR (2022)"},{"issue":"S1","key":"30_CR14","doi-asserted-by":"publisher","first-page":"S63","DOI":"10.1121\/1.2016299","volume":"62","author":"F Jelinek","year":"1977","unstructured":"Jelinek, F., Mercer, R.L., Bahl, L.R., Baker, J.K.: Perplexity-a measure of the difficulty of speech recognition tasks. J. Acoust. Soc. Am. 62(S1), S63\u2013S63 (1977)","journal-title":"J. Acoust. Soc. Am."},{"key":"30_CR15","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7b. arXiv:2310.06825 (2023)"},{"key":"30_CR16","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: ICCV, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"30_CR17","doi-asserted-by":"crossref","unstructured":"Kuckreja, K., Danish, M.S., Naseer, M., Das, A., Khan, S., Khan, F.S.: GeoChat: grounded large vision-language model for remote sensing. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02629"},{"key":"30_CR18","unstructured":"Kudo, T.: MeCab: yet another part-of-speech and morphological analyzer (2013). https:\/\/taku910.github.io\/mecab\/. Accessed 16 Oct 2024"},{"key":"30_CR19","doi-asserted-by":"crossref","unstructured":"Li, C., et\u00a0al.: LLaVA-Med: training a large language-and-vision assistant for biomedicine in one day. In: NeurIPS (2023)","DOI":"10.32388\/VLXB6M"},{"key":"30_CR20","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, Barcelona, Spain, pp. 74\u201381. Association for Computational Linguistics (2004)"},{"key":"30_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"30_CR22","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"30_CR23","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"30_CR24","unstructured":"Llama Team, AI @ Meta: The llama 3 herd of models. arXiv:2407.21783 (2024)"},{"key":"30_CR25","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"30_CR26","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1109\/TPAMI.2019.2927476","volume":"43","author":"J Marin","year":"2019","unstructured":"Marin, J., et al.: Recipe1M+: a dataset for learning cross-modal embeddings for cooking recipes and food images. IEEE Trans. Pattern Anal. Mach. Intell. 43, 187\u2013203 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"30_CR27","unstructured":"OpenAI: GPT-4 Technical Report (2023). https:\/\/cdn.openai.com\/papers\/gpt-4.pdf. Accessed 16 Oct 2024"},{"key":"30_CR28","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. In: NeurIPS (2022)"},{"key":"30_CR29","doi-asserted-by":"crossref","unstructured":"Papadopoulos, D.P., Mora, E., Chepurko, N., Huang, K.W., Ofli, F., Torralba, A.: Learning program representations for food images and cooking recipes. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01606"},{"key":"30_CR30","doi-asserted-by":"crossref","unstructured":"Post, M.: A call for clarity in reporting BLEU scores. In: Proceedings of the Third Conference on Machine Translation: Research Papers (2018)","DOI":"10.18653\/v1\/W18-6319"},{"key":"30_CR31","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"30_CR32","doi-asserted-by":"publisher","unstructured":"Rakuten Group, Inc.: Rakuten recipe data. Informatics Research Data Repository, National Institute of Informatics (dataset) (2017). https:\/\/doi.org\/10.32130\/idr.2.4","DOI":"10.32130\/idr.2.4"},{"key":"30_CR33","doi-asserted-by":"crossref","unstructured":"Salvador, A., Drozdzal, M., Giro-i Nieto, X., Romero, A.: Inverse cooking: recipe generation from food images. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01070"},{"key":"30_CR34","doi-asserted-by":"crossref","unstructured":"Salvador, A., et al.: Learning cross-modal embeddings for cooking recipes and food images. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.327"},{"key":"30_CR35","unstructured":"Song, F., Zhu, B., Hao, Y., Wang, S., He, X.: CAR: consolidation, augmentation and regulation for recipe retrieval. arXiv:2312.04763 (2023)"},{"key":"30_CR36","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv:2307.09288 (2023)"},{"key":"30_CR37","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv:2302.13971 (2023)"},{"key":"30_CR38","unstructured":"Vaswani, A., et\u00a0al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"30_CR39","doi-asserted-by":"crossref","unstructured":"Wang, L., Yamakata, Y., Aizawa, K.: Automatic dataset creation from user-generated recipes for ingredient-centric food image analysis. In: MMAsia (2023)","DOI":"10.1145\/3595916.3626748"},{"key":"30_CR40","unstructured":"Wang, W., et\u00a0al.: CogVLM: visual expert for pretrained language models. arXiv:2311.03079 (2024)"},{"key":"30_CR41","unstructured":"Yin, Y., Qi, H., Zhu, B., Chen, J., Jiang, Y.G., Ngo, C.W.: FoodLMM: a versatile food assistant using large multi-modal model. arXiv:2312.14991 (2024)"},{"key":"30_CR42","doi-asserted-by":"crossref","unstructured":"Yoshikawa, Y., Shigeto, Y., Takeuchi, A.: Stair captions: Constructing a large-scale Japanese image caption dataset. In: ACL (Volume 2: Short Papers) (2017)","DOI":"10.18653\/v1\/P17-2066"},{"key":"30_CR43","unstructured":"Zheng, L., et\u00a0al.: Judging LLM-as-a-judge with MT-bench and chatbot arena. In: NeurIPS Datasets and Benchmarks Track (2023)"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2054-8_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,23]],"date-time":"2025-03-23T01:42:35Z","timestamp":1742694155000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2054-8_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819620531","9789819620548"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2054-8_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"3 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}