{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T20:10:17Z","timestamp":1772050217051,"version":"3.50.1"},"publisher-location":"Cham","reference-count":24,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031882166","type":"print"},{"value":"9783031882173","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88217-3_5","type":"book-chapter","created":{"date-parts":[[2025,5,26]],"date-time":"2025-05-26T10:23:25Z","timestamp":1748255005000},"page":"63-75","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["CalorieLLaVA: Image-Based Calorie Estimation with\u00a0Multimodal Large Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5752-9175","authenticated-orcid":false,"given":"Hikaru","family":"Tanabe","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0431-183X","authenticated-orcid":false,"given":"Keiji","family":"Yanai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,27]]},"reference":[{"key":"5_CR1","unstructured":"Alayrac, J.B., et al: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems, vol.\u00a035, pp. 23716\u201323736 (2022)"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Ando, Y., Ege, T., Cho, J., Yanai, K.: DepthCalorieCam: a mobile application for volume-based foodcalorie estimation using depth cameras. In: Proceedings of the 5th International Workshop on Multimedia Assisted Dietary Management, pp. 76\u201381 (2019)","DOI":"10.1145\/3347448.3357172"},{"key":"5_CR3","unstructured":"Chiang, W.L., et al.: Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"5_CR4","unstructured":"Dai, W., et al.: InstructBLIP: Towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"5_CR5","unstructured":"Dinh, T., et al.: LIFT: Language-interfaced fine-tuning for non-language machine learning tasks. In: Advances in Neural Information Processing Systems, vol.\u00a035, pp. 11763\u201311784 (2022)"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Ege, T., Shimoda, W., Yanai, K.: A new large-scale food image segmentation dataset and its application to food calorie estimation based on grains of rice. In: Proceedings of ACMMM Workshop on Multimedia Assisted Dietary Management (2019)","DOI":"10.1145\/3347448.3357162"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Ege, T., Yanai, K.: Image-based food calorie estimation using knowledge on food categories, ingredients and cooking directions. In: Proceedings of the on Thematic Workshops of ACM Multimedia 2017, pp. 367\u2013375 (2017)","DOI":"10.1145\/3126686.3126742"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Akpa, EAH., \u00a0Suwa, H., Arakawa, Y., Yasumoto, K.: Smartphone-based food weight and calorie estimation method for effective food journaling. SICE J. Contr., Measure. Syst. Integr. 10(5), 360\u2013369 (2017)","DOI":"10.9746\/jcmsi.10.360"},{"key":"5_CR9","unstructured":"Hu, E.J., et al.: LoRA: Low-rank adaptation of large language models. In: Proceedings of International Conference on Learning Representations (2022)"},{"key":"5_CR10","unstructured":"Kaplan, J., et al.: Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)"},{"key":"5_CR11","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of International Conference on Machine Learning (2023)"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"5_CR13","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems (2023)"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Naritomi, S., Yanai, K.: Hungry Networks: 3d mesh reconstruction of a dish and a plate from a single dish image for estimating food volume. In: Proceedigns of the 2nd ACM International Conference on Multimedia in Asia (2021)","DOI":"10.1145\/3444685.3446275"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Okamoto, K., Yanai, K.: An automatic calorie estimation system of food images on a smartphone. In: Proceedings of the 2nd International Workshop on Multimedia Assisted Dietary Management (2016)","DOI":"10.1145\/2986035.2986040"},{"key":"5_CR16","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Rajbhandari, S., Rasley, J., Ruwase, O., He, Y.: ZeRO: Memory optimizations toward training trillion parameter models. In: SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201316 (2020)","DOI":"10.1109\/SC41405.2020.00024"},{"key":"5_CR18","doi-asserted-by":"crossref","unstructured":"Rasley, J., Rajbhandari, S., Ruwase, O., He, Y.: DeepSpeed: System optimizations enable training deep learning models with over 100 billion parameters. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 3505\u20133506 (2020)","DOI":"10.1145\/3394486.3406703"},{"key":"5_CR19","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Tanno, R., Ege, T., Yanai, K.: AR DeepCalorieCam V2: Food calorie estimation with CNN and AR-based actual size estimation. In: Proceedings of the 24th ACM Symposium on Virtual Reality Software and Technology (2018)","DOI":"10.1145\/3281505.3281580"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Thames, Q., et al.: Nutrition5k: Towards automatic nutritional understanding of generic food. In: Proceedings of IEEE Computer Vision and Pattern Recognition, pp. 8903\u20138911 (2021)","DOI":"10.1109\/CVPR46437.2021.00879"},{"key":"5_CR22","unstructured":"Wei, J., et al.: Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)"},{"key":"5_CR23","unstructured":"Yin, Y., et al.: FoodLMM: A versatile food assistant using large multi-modal model. arXiv preprint arXiv:2312.14991 (2023)"},{"key":"5_CR24","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition. ICPR 2024 International Workshops and Challenges"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88217-3_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T19:14:09Z","timestamp":1760296449000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88217-3_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031882166","9783031882173"],"references-count":24,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88217-3_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"27 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}