{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T16:17:54Z","timestamp":1774628274347,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62406126"],"award-info":[{"award-number":["62406126"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Scientific Research Project of the Education Department of Jilin Province","award":["JJKH20250119KJ"],"award-info":[{"award-number":["JJKH20250119KJ"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758203","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:39:06Z","timestamp":1761377946000},"page":"12659-12665","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["RecipeGen: A Step-Aligned Multimodal Benchmark for Real-World Recipe Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0829-6267","authenticated-orcid":false,"given":"Ruoxuan","family":"Zhang","sequence":"first","affiliation":[{"name":"Jilin University, Changchun, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3310-748X","authenticated-orcid":false,"given":"Jidong","family":"Gao","sequence":"additional","affiliation":[{"name":"Guangdong University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4665-6978","authenticated-orcid":false,"given":"Bin","family":"Wen","sequence":"additional","affiliation":[{"name":"Jilin University, Changchun, Jilin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5652-4327","authenticated-orcid":false,"given":"Hongxia","family":"Xie","sequence":"additional","affiliation":[{"name":"Jilin University, Changchun, Jilin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2987-0126","authenticated-orcid":false,"given":"Chenming","family":"Zhang","sequence":"additional","affiliation":[{"name":"Jilin University, Changchun, Jilin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2216-077X","authenticated-orcid":false,"given":"Hong-Han","family":"Shuai","sequence":"additional","affiliation":[{"name":"National Yang Ming Chiao Tung University, Hsinchu, Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4662-7875","authenticated-orcid":false,"given":"Wen-Huang","family":"Cheng","sequence":"additional","affiliation":[{"name":"National Taiwan University, Taipei, Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"arXiv preprint arXiv:2502.13923","author":"Bai Shuai","year":"2025","unstructured":"Shuai Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Sibo Song, Kai Dang, Peng Wang, Shijie Wang, Jun Tang, Humen Zhong, Yuanzhi Zhu, Mingkun Yang, Zhaohai Li, Jianqiang Wan, Pengfei Wang, Wei Ding, Zheren Fu, Yiheng Xu, Jiabo Ye, Xi Zhang, Tianbao Xie, Zesen Cheng, Hang Zhang, Zhibo Yang, Haiyang Xu, and Junyang Lin. 2025. Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_3_1","volume-title":"Tript Sharma, Aman Kumar Sharma, Dheeraj Khanna, Jaspreet Singh Marwah, Srilakshmi Kalathil, Navjot Singh, et al.","author":"Batra Devansh","year":"2020","unstructured":"Devansh Batra, Nirav Diwan, Utkarsh Upadhyay, Jushaan Singh Kalra, Tript Sharma, Aman Kumar Sharma, Dheeraj Khanna, Jaspreet Singh Marwah, Srilakshmi Kalathil, Navjot Singh, et al., 2020. Recipedb: a resource for exploring recipes. Database, Vol. 2020 (2020), baaa077."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.inlg-1.4"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_1_7_1","volume-title":"Training-free regional prompting for diffusion transformers. arXiv preprint arXiv:2411.02395","author":"Chen Anthony","year":"2024","unstructured":"Anthony Chen, Jianjin Xu, Wenzhao Zheng, Gaole Dai, Yida Wang, Renrui Zhang, Haofan Wang, and Shanghang Zhang. 2024. Training-free regional prompting for diffusion transformers. arXiv preprint arXiv:2411.02395 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964315"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00800"},{"key":"e_1_3_2_1_10_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_11_1","first-page":"181","volume-title":"Companion Proceedings of the Web Conference","author":"Lee Helena H.","year":"2020","unstructured":"Helena H. Lee, Ke Shu, Palakorn Achananuparp, Philips Kokoh Prasetyo, Yue Liu, Ee-Peng Lim, and Lav R Varshney. 2020. RecipeGPT: Generative pre-training based cooking recipe generation and evaluation system. In Companion Proceedings of the Web Conference 2020. 181-184."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093463"},{"key":"e_1_3_2_1_13_1","volume-title":"In-context lora for diffusion transformers. arXiv preprint arXiv:2410.23775","author":"Huang Lianghua","year":"2024","unstructured":"Lianghua Huang, Wei Wang, Zhi-Fan Wu, Yupeng Shi, Huanzhang Dou, Chen Liang, Yutong Feng, Yu Liu, and Jingren Zhou. 2024. In-context lora for diffusion transformers. arXiv preprint arXiv:2410.23775 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2929447"},{"key":"e_1_3_2_1_15_1","unstructured":"Glenn Jocher Jing Qiu and Ayush Chaurasia. 2023. Ultralytics YOLO . https:\/\/github.com\/ultralytics\/ultralytics"},{"key":"e_1_3_2_1_16_1","unstructured":"Black Forest Labs. 2024. Flux.1 AI. https:\/\/flux1ai.com\/. Accessed: 2025-04-04."},{"key":"e_1_3_2_1_17_1","volume-title":"Foodsam: Any food segmentation","author":"Lan Xing","year":"2023","unstructured":"Xing Lan, Jiayi Lyu, Hanyu Jiang, Kun Dong, Zehai Niu, Yi Zhang, and Jian Xue. 2023. Foodsam: Any food segmentation. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3554738"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2927476"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00600"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2958761"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3329168"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3237871"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413636"},{"key":"e_1_3_2_1_25_1","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v12i1.15034"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_29_1","unstructured":"Stability AI. 2022. Stable Diffusion v2.1 Model Card. https:\/\/huggingface.co\/stabilityai\/stable-diffusion-2-1. Accessed: 2025-04-11."},{"key":"e_1_3_2_1_30_1","unstructured":"Stability AI. 2024. Stable Diffusion 3.5 Large Model Card. https:\/\/huggingface.co\/stabilityai\/stable-diffusion-3.5-large. Accessed: 2025-04-11."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58583-9_22"},{"key":"e_1_3_2_1_32_1","first-page":"3363","article-title":"Learning structural representations for recipe generation and food retrieval","volume":"45","author":"Wang Hao","year":"2022","unstructured":"Hao Wang, Guosheng Lin, Steven CH Hoi, and Chunyan Miao. 2022. Learning structural representations for recipe generation and food retrieval. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 3 (2022), 3363-3377.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3418211"},{"key":"e_1_3_2_1_34_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing, Vol. 13, 4 (2004), 600-612."},{"key":"e_1_3_2_1_35_1","volume-title":"Visual Goal-Step Inference using wikiHow. arXiv preprint arXiv:2104.05845","author":"Yang Yue","year":"2021","unstructured":"Yue Yang, Artemis Panagopoulou, Qing Lyu, Li Zhang, Mark Yatskar, and Chris Callison-Burch. 2021. Visual Goal-Step Inference using wikiHow. arXiv preprint arXiv:2104.05845 (2021)."},{"key":"e_1_3_2_1_36_1","volume-title":"Foodlmm: A versatile food assistant using large multi-modal model. arXiv preprint arXiv:2312.14991","author":"Yin Yuehao","year":"2023","unstructured":"Yuehao Yin, Huiyan Qi, Bin Zhu, Jingjing Chen, Yu-Gang Jiang, and Chong-Wah Ngo. 2023. Foodlmm: A versatile food assistant using large multi-modal model. arXiv preprint arXiv:2312.14991 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-024-01297-w"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-025-01809-2"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101859"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"e_1_3_2_1_41_1","volume-title":"Towards Automatic Learning of Procedures from Web Instructional Videos. arXiv preprint arXiv:1703.09788","author":"Zhou Luowei","year":"2017","unstructured":"Luowei Zhou, Chenliang Xu, and Jason J Corso. 2017. Towards Automatic Learning of Procedures from Web Instructional Videos. arXiv preprint arXiv:1703.09788 (2017)."},{"key":"e_1_3_2_1_42_1","volume-title":"FoodSky: A Food-oriented Large Language Model that Passes the Chef and Dietetic Examination. arXiv preprint arXiv:2406.10261","author":"Zhou Pengfei","year":"2024","unstructured":"Pengfei Zhou, Weiqing Min, Chaoran Fu, Ying Jin, Mingyu Huang, Xiangyang Li, Shuhuan Mei, and Shuqiang Jiang. 2024. FoodSky: A Food-oriented Large Language Model that Passes the Chef and Dietetic Examination. arXiv preprint arXiv:2406.10261 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758203","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:59:22Z","timestamp":1765310362000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758203"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":42,"alternative-id":["10.1145\/3746027.3758203","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758203","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}