{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:07:01Z","timestamp":1765339621403,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"name":"Chinese Nutrition Society","award":["No. CNS-YUM2024-120"],"award-info":[{"award-number":["No. CNS-YUM2024-120"]}]},{"name":"Fundamental Research Funds for the Central Universities","award":["2023RC72"],"award-info":[{"award-number":["2023RC72"]}]},{"name":"Key Project of Philosophy and Social Sciences Research, Ministry of Education, China","award":["No.24JZD040"],"award-info":[{"award-number":["No.24JZD040"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62176025,No. 62301066,No. 62206012,No. 62406028"],"award-info":[{"award-number":["No.62176025,No. 62301066,No. 62206012,No. 62406028"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755734","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"5060-5069","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["RecipeRAG: Advancing Recipe Generation with Reinforced Retrieval Augmented Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-9868-4556","authenticated-orcid":false,"given":"Jinghan","family":"Yang","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8948-1589","authenticated-orcid":false,"given":"Zhenbo","family":"Xu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China and ShiFang Technology Inc., Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5677-4763","authenticated-orcid":false,"given":"Dehua","family":"Ma","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8730-5824","authenticated-orcid":false,"given":"Liu","family":"Liu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0266-6896","authenticated-orcid":false,"given":"Fei","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5557-6053","authenticated-orcid":false,"given":"Gong","family":"Huang","sequence":"additional","affiliation":[{"name":"Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3433-8435","authenticated-orcid":false,"given":"Zhaofeng","family":"He","sequence":"additional","affiliation":[{"name":"Beijing University of Post and Telecommunication, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210036"},{"key":"e_1_3_2_1_2_1","volume-title":"Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:2210.02928","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen, Hexiang Hu, Xi Chen, Pat Verga, and William W Cohen. 2022. Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:2210.02928 (2022)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00800"},{"key":"e_1_3_2_1_4_1","volume-title":"The faiss library. arXiv preprint arXiv:2401.08281","author":"Douze Matthijs","year":"2024","unstructured":"Matthijs Douze, Alexandr Guzhva, Chengqi Deng, Jeff Johnson, Gergely Szilvasy, Pierre-Emmanuel Mazar\u00e9, Maria Lomeli, Lucas Hosseini, and Herv\u00e9 J\u00e9gou. 2024. The faiss library. arXiv preprint arXiv:2401.08281 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01458"},{"volume-title":"cuisine and class: a study in comparative sociology","author":"Goody Jack","key":"e_1_3_2_1_6_1","unstructured":"Jack Goody. 1982. Cooking, cuisine and class: a study in comparative sociology. Cambridge University Press."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475465"},{"key":"e_1_3_2_1_8_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_9_1","volume-title":"International conference on machine learning. PMLR, 3929-3938","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Mingwei Chang. 2020. Retrieval augmented language model pre-training. In International conference on machine learning. PMLR, 3929-3938."},{"key":"e_1_3_2_1_10_1","volume-title":"Vision-r1: Incentivizing reasoning capability in multimodal large language models. arXiv preprint arXiv:2503.06749","author":"Huang Wenxuan","year":"2025","unstructured":"Wenxuan Huang, Bohan Jia, Zijie Zhai, Shaosheng Cao, Zheyu Ye, Fei Zhao, Yao Hu, and Shaohui Lin. 2025. Vision-r1: Incentivizing reasoning capability in multimodal large language models. arXiv preprint arXiv:2503.06749 (2025)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612193"},{"key":"e_1_3_2_1_12_1","first-page":"1","article-title":"Atlas: Few-shot learning with retrieval augmented language models","volume":"24","author":"Izacard Gautier","year":"2023","unstructured":"Gautier Izacard, Patrick Lewis, Maria Lomeli, Lucas Hosseini, Fabio Petroni, Timo Schick, Jane Dwivedi-Yu, Armand Joulin, Sebastian Riedel, and Edouard Grave. 2023. Atlas: Few-shot learning with retrieval augmented language models. Journal of Machine Learning Research, Vol. 24, 251 (2023), 1-43.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_13_1","volume-title":"Sung Ju Hwang, and Jong C Park","author":"Jeong Soyeong","year":"2024","unstructured":"Soyeong Jeong, Jinheon Baek, Sukmin Cho, Sung Ju Hwang, and Jong C Park. 2024. Adaptive-rag: Learning to adapt retrieval-augmented large language models through question complexity. arXiv preprint arXiv:2403.14403 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"Search-r1: Training llms to reason and leverage search engines with reinforcement learning. arXiv preprint arXiv:2503.09516","author":"Jin Bowen","year":"2025","unstructured":"Bowen Jin, Hansi Zeng, Zhenrui Yue, Dong Wang, Hamed Zamani, and Jiawei Han. 2025. Search-r1: Training llms to reason and leverage search engines with reinforcement learning. arXiv preprint arXiv:2503.09516 (2025)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-4725-6_5"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00244"},{"key":"e_1_3_2_1_17_1","volume-title":"Improving Food Recognition with Retrieval-Augmented and Domain-Adaptive LVLMs. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5.","author":"Ma Dehua","year":"2025","unstructured":"Dehua Ma, Zhenbo Xu, Tianshun Xing, Lu Yuan, Jinghan Yang, Huijia Wu, Ming Lei, and Zhaofeng He. 2025. Improving Food Recognition with Retrieval-Augmented and Domain-Adaptive LVLMs. In ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1-5."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3237871"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679562"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01606"},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Soft Computing and Pattern Recognition. Springer, 871-878","author":"Prasad GNR","year":"2022","unstructured":"GNR Prasad, Y Sri Lalitha, Y Gayatri, and B Indira. 2022. Swasth: An Inverse Cooking Recipe Generation from Food Images. In International Conference on Soft Computing and Pattern Recognition. Springer, 871-878."},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00605"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01070"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01522"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.327"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00503"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2024.104071"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72983-6_7"},{"key":"e_1_3_2_1_30_1","first-page":"501","volume-title":"Nature","volume":"515","author":"Stehfest Elke","year":"2014","unstructured":"Elke Stehfest. 2014. Food choices for health and planet. Nature, Vol. 515, 7528 (2014), 501-502."},{"key":"e_1_3_2_1_31_1","volume-title":"Plug-and-play vqa: Zero-shot vqa by conjoining large pretrained models with zero training. arXiv preprint arXiv:2210.08773","author":"Huat Tiong Anthony Meng","year":"2022","unstructured":"Anthony Meng Huat Tiong, Junnan Li, Boyang Li, Silvio Savarese, and Steven CH Hoi. 2022. Plug-and-play vqa: Zero-shot vqa by conjoining large pretrained models with zero training. arXiv preprint arXiv:2210.08773 (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"Malm: Mask augmentation based local matching for food-recipe retrieval. arXiv preprint arXiv:2305.11327","author":"Voutharoja Bhanu Prakash","year":"2023","unstructured":"Bhanu Prakash Voutharoja, Peng Wang, Lei Wang, and Vivienne Guan. 2023. Malm: Mask augmentation based local matching for food-recipe retrieval. arXiv preprint arXiv:2305.11327 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00549"},{"key":"e_1_3_2_1_34_1","first-page":"359","volume-title":"UK","author":"Wang Hao","year":"2020","unstructured":"Hao Wang, Guosheng Lin, Steven CH Hoi, and Chunyan Miao. 2020. Structure-aware generation network for recipe generation from images. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XXVII 16. Springer, 359-374."},{"key":"e_1_3_2_1_35_1","first-page":"3363","article-title":"Learning structural representations for recipe generation and food retrieval","volume":"45","author":"Wang Hao","year":"2022","unstructured":"Hao Wang, Guosheng Lin, Steven CH Hoi, and Chunyan Miao. 2022. Learning structural representations for recipe generation and food retrieval. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 3 (2022), 3363-3377.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01184"},{"key":"e_1_3_2_1_37_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Xu Fangyuan","year":"2024","unstructured":"Fangyuan Xu, Weijia Shi, and Eunsol Choi. 2024. RECOMP: Improving retrieval-augmented LMs with context compression and selective augmentation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_38_1","unstructured":"Yi Yang Xiaoxuan He Hongkun Pan Xiyan Jiang Yan Deng Xingtao Yang Haoyu Lu Dacheng Yin Fengyun Rao Minfeng Zhu et al. 2025. R1-onevision: Advancing generalized multimodal reasoning through cross-modal formalization. arXiv preprint arXiv:2503.10615 (2025)."},{"key":"e_1_3_2_1_39_1","volume-title":"The unreliability of explanations in few-shot prompting for textual reasoning. Advances in neural information processing systems","author":"Ye Xi","year":"2022","unstructured":"Xi Ye and Greg Durrett. 2022. The unreliability of explanations in few-shot prompting for textual reasoning. Advances in neural information processing systems, Vol. 35 (2022), 30378-30392."},{"key":"e_1_3_2_1_40_1","volume-title":"Foodlmm: A versatile food assistant using large multi-modal model. arXiv preprint arXiv:2312.14991","author":"Yin Yuehao","year":"2023","unstructured":"Yuehao Yin, Huiyan Qi, Bin Zhu, Jingjing Chen, Yu-Gang Jiang, and Chong-Wah Ngo. 2023. Foodlmm: A versatile food assistant using large multi-modal model. arXiv preprint arXiv:2312.14991 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in LLMs. arXiv preprint arXiv:2407.02485","author":"Yu Yue","year":"2024","unstructured":"Yue Yu, Wei Ping, Zihan Liu, Boxin Wang, Jiaxuan You, Chao Zhang, Mohammad Shoeybi, and Bryan Catanzaro. 2024. RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in LLMs. arXiv preprint arXiv:2407.02485 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72983-6_18"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3050090"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01174"},{"key":"e_1_3_2_1_45_1","volume-title":"INTERS: Unlocking the Power of Large Language Models in Search with Instruction Tuning. arXiv preprint arXiv:2401.06532","author":"Zhu Yutao","year":"2024","unstructured":"Yutao Zhu, Peitian Zhang, Chenghao Zhang, Yifei Chen, Binyu Xie, Zhicheng Dou, Zheng Liu, and Ji-Rong Wen. 2024. INTERS: Unlocking the Power of Large Language Models in Search with Instruction Tuning. arXiv preprint arXiv:2401.06532 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755734","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:03:29Z","timestamp":1765339409000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755734"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3755734","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755734","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}