{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T23:08:41Z","timestamp":1779318521905,"version":"3.51.4"},"reference-count":58,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100007129","name":"Shandong Province Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100007129","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109102","type":"journal-article","created":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T22:17:48Z","timestamp":1778969868000},"page":"109102","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Cross-modal recipe retrieval via multi-granularity alignment"],"prefix":"10.1016","volume":"203","author":[{"given":"Runqi","family":"Zan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2660-1050","authenticated-orcid":false,"given":"Tao","family":"Yao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuxin","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guorui","family":"Sheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zongchao","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109102_bib0001","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2019.105428","article-title":"Cross-modal recipe retrieval via parallel-and cross-attention networks learning","volume":"193","author":"Cao","year":"2020","journal-title":"Knowledge-Based Systems"},{"key":"10.1016\/j.neunet.2026.109102_bib0002","series-title":"Proceedings of the 25th ACM international conference on multimedia","first-page":"1771","article-title":"Cross-modal recipe retrieval with rich food attributes","author":"Chen","year":"2017"},{"key":"10.1016\/j.neunet.2026.109102_bib0003","series-title":"Proceedings of the 26th ACM international conference on multimedia","first-page":"1020","article-title":"Deep understanding of cooking procedure for cross-modal recipe retrieval","author":"Chen","year":"2018"},{"key":"10.1016\/j.neunet.2026.109102_bib0004","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"5896","article-title":"Learning a sparse transformer network for effective image deraining","author":"Chen","year":"2023"},{"key":"10.1016\/j.neunet.2026.109102_bib0005","series-title":"International conference on learning representations","first-page":"1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.neunet.2026.109102_bib0006","series-title":"Proceedings of the 22nd SIGMORPHON workshop on computational morphology, phonology, and phonetics","first-page":"1","article-title":"Prompt and circumstance: A word-by-word LLM prompting approach to interlinear glossing for low-resource languages","author":"Elsner","year":"2025"},{"key":"10.1016\/j.neunet.2026.109102_bib0007","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"14570","article-title":"MCEN: Bridging cross-modal gap between cooking recipes and dish images with latent variable model","author":"Fu","year":"2020"},{"key":"10.1016\/j.neunet.2026.109102_bib0008","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.125087","article-title":"Llms-based machine translation for e-commerce","volume":"258","author":"Gao","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.neunet.2026.109102_bib0009","series-title":"Proceedings of the 29th ACM international conference on multimedia","first-page":"3192","article-title":"Cross-modal retrieval and synthesis (x-mrs): Closing the modality gap in shared subspace learning","author":"Guerrero","year":"2021"},{"key":"10.1016\/j.neunet.2026.109102_bib0010","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.neunet.2026.109102_bib0011","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109897","article-title":"Sparse self-attention transformer for image inpainting","volume":"145","author":"Huang","year":"2024","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.neunet.2026.109102_bib0012","doi-asserted-by":"crossref","first-page":"2783","DOI":"10.1109\/TMM.2024.3384672","article-title":"Cross-modal recipe retrieval with fine-grained prompting alignment and evidential semantic consistency","volume":"27","author":"Huang","year":"2025","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109102_bib0013","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2022.116979","article-title":"An attention-based convolutional neural network for recipe recommendation","volume":"201","author":"Jia","year":"2022","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.neunet.2026.109102_bib0014","unstructured":"Kumar, A., Naseer, M., Narayan, S., Anwer, R. M., Khan, S., & Cholakkal, H. (2024). Multi-modal generation via cross-modal in-context learning. arXiv preprint arXiv: 2405.18304."},{"key":"10.1016\/j.neunet.2026.109102_bib0015","series-title":"Proceedings of the 2021 international conference on multimedia retrieval","first-page":"173","article-title":"Cross-modal image-recipe retrieval via intra-and inter-modality hybrid fusion","author":"Li","year":"2021"},{"key":"10.1016\/j.neunet.2026.109102_bib0016","doi-asserted-by":"crossref","first-page":"6609","DOI":"10.1109\/TMM.2024.3355644","article-title":"Cross-modal adaptive dual association for text-to-image person retrieval","volume":"26","author":"Lin","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109102_bib0017","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"568","article-title":"Graph-based cross-domain knowledge distillation for cross-dataset text-to-image person retrieval","author":"Luo","year":"2025"},{"issue":"1","key":"10.1016\/j.neunet.2026.109102_bib0018","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1109\/TPAMI.2019.2927476","article-title":"Recipe1M+: A dataset for learning cross-modal embeddings for cooking recipes and food images","volume":"43","author":"Mar\u0131n","year":"2021","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.109102_bib0019","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.127630","article-title":"CELLMEA: A collaboratively enhanced large language model-based entity alignment for aircraft fault maintenance","volume":"282","author":"Meng","year":"2025","journal-title":"Expert Systems with Applications"},{"issue":"7","key":"10.1016\/j.neunet.2026.109102_bib0020","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3715143","article-title":"Multimodal food learning","volume":"21","author":"Min","year":"2025","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"issue":"5","key":"10.1016\/j.neunet.2026.109102_bib0021","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3329168","article-title":"A survey on food computing","volume":"52","author":"Min","year":"2019","journal-title":"Acm Computing Surveys (CSUR)"},{"key":"10.1016\/j.neunet.2026.109102_bib0022","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.127603","article-title":"Service-oriented multi-platform for food computing: A mobile application for recipe adaptation to nutrition behaviours (AI2cuisine)","volume":"281","author":"Morales-Garz\u00f3n","year":"2025","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.neunet.2026.109102_bib0023","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"2423","article-title":"CHEF: Cross-modal hierarchical embeddings for food domain retrieval","author":"Pham","year":"2021"},{"key":"10.1016\/j.neunet.2026.109102_bib0024","series-title":"International conference on machine learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neunet.2026.109102_bib0025","doi-asserted-by":"crossref","first-page":"53","DOI":"10.1162\/tacl_a_00353","article-title":"Efficient content-based sparse attention with routing transformers","volume":"9","author":"Roy","year":"2021","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"10.1016\/j.neunet.2026.109102_bib0026","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"15475","article-title":"Revamping cross-modal recipe retrieval with hierarchical transformers and self-supervised learning","author":"Salvador","year":"2021"},{"key":"10.1016\/j.neunet.2026.109102_bib0027","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"3020","article-title":"Learning cross-modal embeddings for cooking recipes and food images","author":"Salvador","year":"2017"},{"key":"10.1016\/j.neunet.2026.109102_bib0028","doi-asserted-by":"crossref","DOI":"10.1016\/j.foodchem.2024.141133","article-title":"Relatively reliable and rapid identification of colorant compounds in food matrices by HPLC-DAD-QTOF-MS combined with theoretical calculation","volume":"463","author":"Shi","year":"2025","journal-title":"Food Chemistry"},{"key":"10.1016\/j.neunet.2026.109102_bib0029","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"4567","article-title":"Transformer decoders with multimodal regularization for cross-modal food retrieval","author":"Shukor","year":"2022"},{"key":"10.1016\/j.neunet.2026.109102_bib0030","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2024.104071","article-title":"Vision and structured-language pretraining for cross-modal food retrieval","volume":"247","author":"Shukor","year":"2024","journal-title":"Computer Vision and Image Understanding"},{"key":"10.1016\/j.neunet.2026.109102_bib0031","series-title":"Proceedings of the 29th ACM international conference on multimedia","first-page":"2501","article-title":"Cross-modal recipe embeddings by disentangling recipe contents and dish styles","author":"Sugiyama","year":"2021"},{"issue":"5","key":"10.1016\/j.neunet.2026.109102_bib0032","doi-asserted-by":"crossref","DOI":"10.1016\/j.jfp.2025.100476","article-title":"Effect of ripening temperature on microbial safety and biogenic amine levels in rennet cheeses produced from raw cow milk","volume":"88","author":"Szosland-Fa\u0142tyn","year":"2025","journal-title":"Journal of Food Protection"},{"issue":"1","key":"10.1016\/j.neunet.2026.109102_bib0033","doi-asserted-by":"crossref","first-page":"424","DOI":"10.3390\/app15010424","article-title":"Computational fluid dynamics simulation of thermal processes in food technology and their applications in the food industry","volume":"15","author":"Szpicer","year":"2025","journal-title":"Applied Sciences"},{"key":"10.1016\/j.neunet.2026.109102_bib0034","unstructured":"G. Team, Anil, R., Borgeaud, S., Alayrac, J.-B., Yu, J., Soricut, R., Schalkwyk, J., Dai, A. M., Hauth, A., Millican, K. et al. (2023). Gemini: a family of highly capable multimodal models. arXiv preprint arXiv: 2312.11805."},{"key":"10.1016\/j.neunet.2026.109102_bib0035","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"8903","article-title":"Nutrition5k: Towards automatic nutritional understanding of generic food","author":"Thames","year":"2021"},{"key":"10.1016\/j.neunet.2026.109102_bib0036","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S. et al. (2023). Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv: 2307.09288."},{"key":"10.1016\/j.neunet.2026.109102_bib0037","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109102_bib0038","unstructured":"Voutharoja, B. P., Wang, P., Wang, L., & Guan, V. (2023). MALM: Mask augmentation based local matching for food-recipe retrieval. arXiv preprint arXiv: 2305.11327."},{"key":"10.1016\/j.neunet.2026.109102_bib0039","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"5584","article-title":"Fine-grained alignment for cross-modal recipe retrieval","author":"Wahed","year":"2024"},{"key":"10.1016\/j.neunet.2026.109102_bib0040","series-title":"Computer vision\u2013ECCV 2020: 16th european conference, glasgow, UK, august 23\u201328, 2020, proceedings, part XXVII 16","first-page":"359","article-title":"Structure-aware generation network for recipe generation from images","author":"Wang","year":"2020"},{"key":"10.1016\/j.neunet.2026.109102_bib0041","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"11572","article-title":"Learning cross-modal embeddings with adversarial networks for cooking recipes and food images","author":"Wang","year":"2019"},{"key":"10.1016\/j.neunet.2026.109102_bib0042","doi-asserted-by":"crossref","first-page":"2515","DOI":"10.1109\/TMM.2021.3083109","article-title":"Cross-modal food retrieval: Learning a joint embedding of food images and recipes with semantic consistency and attention mechanism","volume":"24","author":"Wang","year":"2021","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109102_bib0043","doi-asserted-by":"crossref","first-page":"2848","DOI":"10.1109\/TMM.2025.3543067","article-title":"Threefold encoder interaction: Hierarchical multi-grained semantic alignment for cross-modal food retrieval","volume":"27","author":"Wang","year":"2025","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109102_bib0044","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"21189","article-title":"Pre-trained vision-language models as noisy partial annotators","author":"Wang","year":"2025"},{"key":"10.1016\/j.neunet.2026.109102_bib0045","series-title":"Proceedings of the 32nd ACM international conference on multimedia","first-page":"8296","article-title":"Multimodal llm enhanced cross-lingual cross-modal retrieval","author":"Wang","year":"2024"},{"key":"10.1016\/j.neunet.2026.109102_bib0046","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113082","article-title":"Disentangled sparse graph attention networks with multi-intent fusion for session-based recommendation","volume":"311","author":"Wang","year":"2025","journal-title":"Knowledge-Based Systems"},{"key":"10.1016\/j.neunet.2026.109102_bib0047","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"10941","article-title":"Multi-modality cross attention network for image and sentence matching","author":"Wei","year":"2020"},{"key":"10.1016\/j.neunet.2026.109102_bib0048","series-title":"Icassp 2025-2025 ieee international conference on acoustics, speech and signal processing (icassp)","first-page":"1","article-title":"Relation-aware semantic alignment network for text-to-image person retrieval","author":"Wu","year":"2025"},{"issue":"6","key":"10.1016\/j.neunet.2026.109102_bib0049","doi-asserted-by":"crossref","first-page":"3304","DOI":"10.1109\/TSC.2021.3098834","article-title":"Learning TFIDF enhanced joint embedding for recipe-image cross-modal retrieval service","volume":"15","author":"Xie","year":"2022","journal-title":"IEEE Transactions on Services Computing"},{"issue":"4","key":"10.1016\/j.neunet.2026.109102_bib0050","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3490519","article-title":"Learning text-image joint embedding for efficient cross-modal retrieval with deep feature engineering","volume":"40","author":"Xie","year":"2021","journal-title":"ACM Transactions on Information Systems"},{"key":"10.1016\/j.neunet.2026.109102_bib0051","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113806","article-title":"GCS-Net: A universal ai-generated visual content detection method based on clip","volume":"323","author":"Xu","year":"2025","journal-title":"Knowledge-Based Systems"},{"key":"10.1016\/j.neunet.2026.109102_bib0052","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"5728","article-title":"Restormer: Efficient transformer for high-resolution image restoration","author":"Zamir","year":"2022"},{"key":"10.1016\/j.neunet.2026.109102_bib0053","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112641","article-title":"Cross-modal recipe retrieval based on unified text encoder with fine-grained contrastive learning","volume":"305","author":"Zhang","year":"2024","journal-title":"Knowledge-Based Systems"},{"key":"10.1016\/j.neunet.2026.109102_bib0054","series-title":"Proceedings of the 47th international ACM SIGIR conference on research and development in information retrieval","first-page":"862","article-title":"Universal adversarial perturbations for vision-language pre-trained models","author":"Zhang","year":"2024"},{"issue":"5","key":"10.1016\/j.neunet.2026.109102_bib0055","doi-asserted-by":"crossref","DOI":"10.1016\/j.patter.2025.101234","article-title":"Foodsky: A food-oriented large language model that can pass the chef and dietetic examinations","volume":"6","author":"Zhou","year":"2025","journal-title":"Patterns"},{"key":"10.1016\/j.neunet.2026.109102_bib0056","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"11477","article-title":"R2GAN: Cross-modal recipe retrieval with generative adversarial network","author":"Zhu","year":"2019"},{"key":"10.1016\/j.neunet.2026.109102_bib0057","doi-asserted-by":"crossref","first-page":"33283","DOI":"10.1109\/ACCESS.2024.3370158","article-title":"CREAMY: Cross-modal recipe retrieval by avoiding matching imperfectly","volume":"12","author":"Zou","year":"2024","journal-title":"IEEE Access"},{"issue":"11","key":"10.1016\/j.neunet.2026.109102_bib0058","doi-asserted-by":"crossref","first-page":"1628","DOI":"10.3390\/foods13111628","article-title":"Disambiguity and alignment: An effective multi-modal alignment method for cross-modal recipe retrieval","volume":"13","author":"Zou","year":"2024","journal-title":"Foods"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005629?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026005629?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T22:48:43Z","timestamp":1779317323000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026005629"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":58,"alternative-id":["S0893608026005629"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109102","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Cross-modal recipe retrieval via multi-granularity alignment","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109102","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109102"}}