{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T14:03:16Z","timestamp":1774447396520,"version":"3.50.1"},"reference-count":119,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T00:00:00Z","timestamp":1766966400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.eswa.2025.130813","type":"journal-article","created":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T17:00:56Z","timestamp":1767027656000},"page":"130813","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Large-scale benchmarks for multimodal recommendation with Ducho"],"prefix":"10.1016","volume":"307","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6600-1938","authenticated-orcid":false,"given":"Matteo","family":"Attimonelli","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5203-1229","authenticated-orcid":false,"given":"Danilo","family":"Danese","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3238-4327","authenticated-orcid":false,"given":"Angela","family":"Di Fazio","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2228-0333","authenticated-orcid":false,"given":"Daniele","family":"Malitesta","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5206-3909","authenticated-orcid":false,"given":"Claudio","family":"Pomo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0939-5462","authenticated-orcid":false,"given":"Tommaso","family":"Di Noia","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.eswa.2025.130813_bib0001","series-title":"SIGIR","first-page":"2405","article-title":"Elliot: A comprehensive and rigorous framework for reproducible recommender systems evaluation","author":"Anelli","year":"2021"},{"key":"10.1016\/j.eswa.2025.130813_bib0002","series-title":"Dl4sr@cikm","article-title":"Reshaping graph recommendation with edge graph collaborative filtering and customer reviews","volume":"vol. 3317","author":"Anelli","year":"2022"},{"key":"10.1016\/j.eswa.2025.130813_bib0003","series-title":"WWW (Companion volume)","first-page":"1075","article-title":"Ducho 2.0: Towards a more up-to-date unified framework for the extraction of multimodal features in recommendation","author":"Attimonelli","year":"2024"},{"issue":"2","key":"10.1016\/j.eswa.2025.130813_bib0004","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","article-title":"Multimodal machine learning: A survey and taxonomy","volume":"41","author":"Baltrusaitis","year":"2019","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2025.130813_bib0005","series-title":"The netflix prize","author":"Bennett","year":"2007"},{"key":"10.1016\/j.eswa.2025.130813_bib0006","doi-asserted-by":"crossref","first-page":"805","DOI":"10.1109\/TMM.2021.3059508","article-title":"Heterogeneous hierarchical feature aggregation network for personalized micro-video recommendation","volume":"24","author":"Cai","year":"2022","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.eswa.2025.130813_bib0007","series-title":"Proceedings of the 5th ACM conference on recommender systems RecSys 2011","article-title":"2nd workshop on information heterogeneity and fusion in recommender systems (hetrec 2011)","author":"Cantador","year":"2011"},{"key":"10.1016\/j.eswa.2025.130813_bib0008","series-title":"ACM Multimedia","first-page":"385","article-title":"Breaking isolation: Multimodal graph fusion for multimedia recommendation by edge-wise modulation","author":"Chen","year":"2022"},{"key":"10.1016\/j.eswa.2025.130813_bib0009","series-title":"IJCAI","first-page":"2449","article-title":"Neural tensor model for learning multi-aspect factors in recommender systems","author":"Chen","year":"2020"},{"key":"10.1016\/j.eswa.2025.130813_bib0010","series-title":"KDD","article-title":"POG: Personalized outfit generation for fashion recommendation at alibaba ifashion","author":"Chen","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0011","series-title":"SIGIR","first-page":"765","article-title":"Personalized fashion recommendation with visual explanations based on multimodal attention network: Towards visually explainable recommendation","author":"Chen","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0012","doi-asserted-by":"crossref","first-page":"484","DOI":"10.1109\/TMM.2020.2978618","article-title":"Learning and fusing multiple user interest representations for micro-video and movie recommendations","volume":"23","author":"Chen","year":"2021","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.eswa.2025.130813_bib0013","doi-asserted-by":"crossref","unstructured":"Chen, Z., Liu, G., Zhang, B.-W., Yang, Q., & Wu, L. (2023). AltCLIP: Altering the language encoder in CLIP for extended language capabilities. In A. Rogers, J. Boyd-Graber, & N. Okazaki (Eds.), Findings of the association for computational linguistics: ACL 2023 (pp. 8666\u20138682). Toronto, Canada: Association for Computational Linguistics. 10.18653\/v1\/2023.findings-acl.552.","DOI":"10.18653\/v1\/2023.findings-acl.552"},{"issue":"2","key":"10.1016\/j.eswa.2025.130813_bib0014","doi-asserted-by":"crossref","first-page":"16:1","DOI":"10.1145\/3291060","article-title":"MMALFM: Explainable recommendation by leveraging reviews and images","volume":"37","author":"Cheng","year":"2019","journal-title":"ACM Transactions on Information Systems"},{"key":"10.1016\/j.eswa.2025.130813_bib0015","first-page":"125","article-title":"On effective personalized music retrieval by exploring online user behaviors","author":"Cheng","year":"2016"},{"issue":"2","key":"10.1016\/j.eswa.2025.130813_bib0016","doi-asserted-by":"crossref","first-page":"317","DOI":"10.1109\/TKDE.2018.2881260","article-title":"MV-RNN: A multi-view recurrent neural network for sequential recommendation","volume":"32","author":"Cui","year":"2020","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"10.1016\/j.eswa.2025.130813_bib0017","series-title":"CVPR Workshops","first-page":"3961","article-title":"A study on the relative importance of convolutional neural networks in visually-aware recommender systems","author":"Deldjoo","year":"2021"},{"key":"10.1016\/j.eswa.2025.130813_bib0018","series-title":"ECIR (2)","first-page":"84","article-title":"Leveraging content-style item representation for visual recommendation","volume":"vol. 13186","author":"Deldjoo","year":"2022"},{"key":"10.1016\/j.eswa.2025.130813_bib0019","series-title":"NAACL-HLT (1)","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0020","series-title":"ACM Multimedia","first-page":"302","article-title":"Personalized capsule wardrobe creation with garment and user modeling","author":"Dong","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0021","series-title":"ICLR","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"issue":"6","key":"10.1016\/j.eswa.2025.130813_bib0022","doi-asserted-by":"crossref","first-page":"1647","DOI":"10.1109\/TMM.2019.2945180","article-title":"Hierarchical attention network for visually-aware food recommendation","volume":"22","author":"Gao","year":"2020","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.eswa.2025.130813_bib0023","doi-asserted-by":"crossref","unstructured":"Geng, X., Zhang, H., Bian, J., & Chua, T. (2015). Learning image and user features for recommendation in social networks. In 2015\u202fIEEE international conference on computer vision, ICCV 2015, santiago, chile, december 7\u201313, 2015 (pp. 4274\u20134282). IEEE Computer Society. 10.1109\/ICCV.2015.486.","DOI":"10.1109\/ICCV.2015.486"},{"key":"10.1016\/j.eswa.2025.130813_bib0024","series-title":"AAAI","first-page":"8454","article-title":"Lgmrec: Local and global graph learning for multimodal recommendation","author":"Guo","year":"2024"},{"key":"10.1016\/j.eswa.2025.130813_bib0025","series-title":"Acm multimedia","article-title":"Learning fashion compatibility with bidirectional LSTMs","author":"Han","year":"2017"},{"key":"10.1016\/j.eswa.2025.130813_bib0026","series-title":"CVPR","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.eswa.2025.130813_bib0027","series-title":"WWW","first-page":"507","article-title":"Ups and downs: Modeling the visual evolution of fashion trends with one-class collaborative filtering","author":"He","year":"2016"},{"key":"10.1016\/j.eswa.2025.130813_bib0028","series-title":"AAAI","first-page":"144","article-title":"VBPR: Visual bayesian personalized ranking from implicit feedback","author":"He","year":"2016"},{"key":"10.1016\/j.eswa.2025.130813_bib0029","series-title":"SIGIR","first-page":"639","article-title":"LightGCN: Simplifying and powering graph convolution network for recommendation","author":"He","year":"2020"},{"key":"10.1016\/j.eswa.2025.130813_bib0030","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y., Parekh, Z., Pham, H., Le, Q. V., Sung, Y., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. In M. Meila, & T. Zhang (Eds.), Proceedings of the 38th international conference on machine learning, ICML 2021, 18\u201324 july 2021, virtual event, Proceedings of Machine Learning Research (pp. 4904\u20134916). PMLR (vol. 139). http:\/\/proceedings.mlr.press\/v139\/jia21b.html."},{"key":"10.1016\/j.eswa.2025.130813_bib0031","doi-asserted-by":"crossref","unstructured":"Jiang, H., Wang, W., Liu, M., Nie, L., Duan, L., & Xu, C. (2019). Market2dish: A health-aware food recommendation system. In L. Amsaleg, B. Huet, M.A. Larson, G. Gravier, H. Hung, C. Ngo, & W.T. Ooi (Eds.), Proceedings of the 27th ACM international conference on multimedia, MM 2019, nice, france, october 21\u201325, 2019 (pp. 2188\u20132190). ACM. 10.1145\/3343031.3350594.","DOI":"10.1145\/3343031.3350594"},{"key":"10.1016\/j.eswa.2025.130813_bib0032","series-title":"ACM Multimedia","first-page":"7591","article-title":"DiffMM: Multi- modal diffusion model for recommendation","author":"Jiang","year":"2024"},{"key":"10.1016\/j.eswa.2025.130813_bib0033","series-title":"CIKM","first-page":"993","article-title":"MARIO: Modality-aware attention and modality-preserving decoders for multimedia recommendation","author":"Kim","year":"2022"},{"issue":"8","key":"10.1016\/j.eswa.2025.130813_bib0034","doi-asserted-by":"crossref","first-page":"30","DOI":"10.1109\/MC.2009.263","article-title":"Matrix factorization techniques for recommender systems","volume":"42","author":"Koren","year":"2009","journal-title":"Computer"},{"key":"10.1016\/j.eswa.2025.130813_bib0035","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2021.115708","article-title":"Is the suggested food your desired?: Multi-modal recipe recommendation with demand-based knowledge graph","volume":"186","author":"Lei","year":"2021","journal-title":"Expert Systems with Applications"},{"issue":"8","key":"10.1016\/j.eswa.2025.130813_bib0036","doi-asserted-by":"crossref","first-page":"1639","DOI":"10.1109\/TKDE.2019.2906180","article-title":"Translation-based sequential recommendation for complex users on sparse data","volume":"32","author":"Li","year":"2020","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"issue":"6","key":"10.1016\/j.eswa.2025.130813_bib0037","doi-asserted-by":"crossref","first-page":"1548","DOI":"10.1007\/s10618-019-00632-4","article-title":"Hhmf: Hidden hierarchical matrix factorization for recommender systems","volume":"33","author":"Li","year":"2019","journal-title":"Data Mining and Knowledge Discovery"},{"key":"10.1016\/j.eswa.2025.130813_bib0038","series-title":"SIGIR","first-page":"1239","article-title":"A revisit to social network-based recommender systems","author":"Li","year":"2014"},{"key":"10.1016\/j.eswa.2025.130813_bib0039","series-title":"Recsys","first-page":"27","article-title":"Overlapping community regularization for rating prediction in social recommender systems","author":"Li","year":"2015"},{"key":"10.1016\/j.eswa.2025.130813_bib0040","series-title":"ICML","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume":"vol. 162","author":"Li","year":"2022"},{"key":"10.1016\/j.eswa.2025.130813_bib0041","series-title":"ACM Multimedia","first-page":"1526","article-title":"User diverse preference modeling by multimodal attentive metric learning","author":"Liu","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0042","article-title":"Multimodal recommender systems: A survey","volume":"abs\/2302.03883","author":"Liu","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.eswa.2025.130813_bib0043","series-title":"KDD","first-page":"6566","article-title":"Multimodal pretraining, adaptation, and generation for recommendation: A survey","author":"Liu","year":"2024"},{"key":"10.1016\/j.eswa.2025.130813_bib0044","series-title":"ACM Multimedia","first-page":"3755","article-title":"Mmfashion: An open-source toolbox for visual fashion analysis","author":"Liu","year":"2021"},{"key":"10.1016\/j.eswa.2025.130813_bib0045","series-title":"ACM Multimedia","first-page":"2853","article-title":"Pre-training graph transformer with multimodal side information for recommendation","author":"Liu","year":"2021"},{"key":"10.1016\/j.eswa.2025.130813_bib0046","series-title":"ICMR","first-page":"99","article-title":"Multi-modal contrastive pre-training for recommendation","author":"Liu","year":"2022"},{"key":"10.1016\/j.eswa.2025.130813_bib0047","series-title":"EvalRS@KDD","article-title":"Disentangling the performance puzzle of multimodal-aware recommender systems","volume":"vol. 3450","author":"Malitesta","year":"2023"},{"issue":"3","key":"10.1016\/j.eswa.2025.130813_bib0048","first-page":"37:1","article-title":"Formalizing multimedia recommendation through multimodal deep learning","volume":"3","author":"Malitesta","year":"2025","journal-title":"Trans. Recomm. Syst."},{"key":"10.1016\/j.eswa.2025.130813_bib0049","series-title":"Mmir@mm","first-page":"59","article-title":"On popularity bias of multimodal-aware recommender systems: A modalities-driven analysis","author":"Malitesta","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0050","series-title":"ACM Multimedia","first-page":"9668","article-title":"Ducho: A unified framework for the extraction of multimodal features in recommendation","author":"Malitesta","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0051","series-title":"CIKM","first-page":"3943","article-title":"Do we really need to drop items with missing modalities in multimodal recommendation?","author":"Malitesta","year":"2024"},{"issue":"1","key":"10.1016\/j.eswa.2025.130813_bib0052","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1109\/TPAMI.2019.2927476","article-title":"Recipe1m+: A dataset for learning cross-modal embeddings for cooking recipes and food images","volume":"43","author":"Marin","year":"2019","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2025.130813_bib0053","series-title":"SIGIR","first-page":"43","article-title":"Image-based recommendations on styles and substitutes","author":"McAuley","year":"2015"},{"issue":"10","key":"10.1016\/j.eswa.2025.130813_bib0054","doi-asserted-by":"crossref","first-page":"2659","DOI":"10.1109\/TMM.2019.2958761","article-title":"Food recommendation: Framework, existing solutions, and challenges","volume":"22","author":"Min","year":"2020","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.eswa.2025.130813_bib0055","series-title":"ACM Multimedia","first-page":"376","article-title":"Learning hybrid behavior patterns for multimedia recommendation","author":"Mu","year":"2022"},{"key":"10.1016\/j.eswa.2025.130813_bib0056","series-title":"ICML","first-page":"689","article-title":"Multimodal deep learning","author":"Ngiam","year":"2011"},{"key":"10.1016\/j.eswa.2025.130813_bib0057","series-title":"EMNLP\/IJCNLP (1)","first-page":"188","article-title":"Justifying recommendations using distantly-labeled reviews and fine-grained aspects","author":"Ni","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0058","series-title":"WSDM","first-page":"773","article-title":"Spectrum-based modality representation fusion graph convolutional network for multimodal recommendation","author":"Ong","year":"2025"},{"key":"10.1016\/j.eswa.2025.130813_bib0059","series-title":"Dlrs@recsys","first-page":"32","article-title":"A deep multimodal approach for cold-start music recommendation","author":"Oramas","year":"2017"},{"key":"10.1016\/j.eswa.2025.130813_bib0060","series-title":"Recsys","first-page":"894","article-title":"Yambda-5b - a large-scale multi-modal dataset for ranking and retrieval","author":"Ploshkin","year":"2025"},{"key":"10.1016\/j.eswa.2025.130813_bib0061","article-title":"Do recommender systems really leverage multimodal content? a comprehensive analysis on multimodal representations for recommendation","volume":"abs\/2508.04571","author":"Pomo","year":"2025","journal-title":"CoRR"},{"key":"10.1016\/j.eswa.2025.130813_bib0062","series-title":"ICML","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume":"vol. 139","author":"Radford","year":"2021"},{"key":"10.1016\/j.eswa.2025.130813_bib0063","series-title":"EMNLP\/IJCNLP (1)","first-page":"3980","article-title":"Sentence-BERT: Sentence embeddings using siamese BERT-networks","author":"Reimers","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0064","series-title":"UAI","article-title":"BPR: Bayesian personalized ranking from implicit feedback","author":"Rendle","year":"2009"},{"key":"10.1016\/j.eswa.2025.130813_bib0065","article-title":"Recommender systems handbook","year":"2015"},{"key":"10.1016\/j.eswa.2025.130813_bib0066","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","article-title":"Learning cross-modal embeddings for cooking recipes and food images","author":"Salvador","year":"2017"},{"key":"10.1016\/j.eswa.2025.130813_bib0067","doi-asserted-by":"crossref","first-page":"2019","DOI":"10.1109\/TMM.2020.3007330","article-title":"Context-dependent propagating-based video recommendation in multimodal heterogeneous information networks","volume":"23","author":"Sang","year":"2021","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.eswa.2025.130813_bib0068","series-title":"WWW","first-page":"285","article-title":"Item-based collaborative filtering recommendation algorithms","author":"Sarwar","year":"2001"},{"key":"10.1016\/j.eswa.2025.130813_bib0069","series-title":"IJCNN","first-page":"1","article-title":"Enhancing music recommendation with social media content: An attentive multimodal autoencoder approach","author":"Shen","year":"2020"},{"key":"10.1016\/j.eswa.2025.130813_bib0070","doi-asserted-by":"crossref","unstructured":"Song, X., Feng, F., Liu, J., Li, Z., Nie, L., & Ma, J. (2017). Neurostylist: Neural compatibility modeling for clothing matching. In Q. Liu, R. Lienhart, H. Wang, S.K. Chen, S. Boll, Y.P. Chen, G. Friedland, J. Li, & S. Yan (Eds.), Proceedings of the 2017\u202fACM on multimedia conference, MM 2017, mountain view, ca, usa, october 23\u201327, 2017 (pp. 753\u2013761). ACM. 10.1145\/3123266.3123314.","DOI":"10.1145\/3123266.3123314"},{"key":"10.1016\/j.eswa.2025.130813_bib0071","series-title":"ACM Multimedia","first-page":"320","article-title":"GP-BPR: Personalized compatibility modeling for clothing matching","author":"Song","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0072","series-title":"Recsys","first-page":"847","article-title":"See the movie, hear the song, read the book: Extending movielens-1m, last.fm-2k, and DBbook with multimodal data","author":"Spillo","year":"2025"},{"key":"10.1016\/j.eswa.2025.130813_bib0073","series-title":"ACM Multimedia","first-page":"5838","article-title":"SOIL: Contrastive second-order interest learning for multimodal recommendation","author":"Su","year":"2024"},{"key":"10.1016\/j.eswa.2025.130813_bib0074","article-title":"Themes inferred audio-visual correspondence learning","author":"Su","year":"2020","journal-title":"CoRR"},{"key":"10.1016\/j.eswa.2025.130813_bib0075","series-title":"CIKM","first-page":"1405","article-title":"Multi-modal knowledge graphs for recommender systems","author":"Sun","year":"2020"},{"key":"10.1016\/j.eswa.2025.130813_bib0076","doi-asserted-by":"crossref","first-page":"5107","DOI":"10.1109\/TMM.2022.3187556","article-title":"Self-supervised learning for multimedia recommendation","volume":"25","author":"Tao","year":"2023","journal-title":"IEEE Trans. Multim."},{"issue":"5","key":"10.1016\/j.eswa.2025.130813_bib0077","doi-asserted-by":"crossref","DOI":"10.1016\/j.ipm.2020.102277","article-title":"MGAT: Multimodal graph attention network for recommendation","volume":"57","author":"Tao","year":"2020","journal-title":"Information Processing Management"},{"key":"10.1016\/j.eswa.2025.130813_bib0078","unstructured":"Tianchi (2018). Fashion collocation data on taobao.com. https:\/\/tianchi.aliyun.com\/dataset\/dataDetail?dataId=52."},{"key":"10.1016\/j.eswa.2025.130813_bib0079","series-title":"NIPS","first-page":"5998","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.eswa.2025.130813_bib0080","series-title":"BigMM","first-page":"25","article-title":"Multimodal fusion based attentive networks for sequential music recommendation","author":"Vaswani","year":"2021"},{"key":"10.1016\/j.eswa.2025.130813_bib0081","series-title":"EMNLP","article-title":"LRMM: Learning to recommend with missing modalities","author":"Wang","year":"2018"},{"key":"10.1016\/j.eswa.2025.130813_bib0082","doi-asserted-by":"crossref","first-page":"1074","DOI":"10.1109\/TMM.2021.3138298","article-title":"DualGNN: Dual graph neural network for multimedia recommendation","volume":"25","author":"Wang","year":"2023","journal-title":"IEEE Trans. Multim."},{"issue":"1","key":"10.1016\/j.eswa.2025.130813_bib0083","doi-asserted-by":"crossref","first-page":"33:1","DOI":"10.1145\/3418211","article-title":"Market2dish: Health-aware food recommendation","volume":"17","author":"Wang","year":"2021","journal-title":"ACM Trans. Multim. Comput. Commun. Appl."},{"key":"10.1016\/j.eswa.2025.130813_bib0084","series-title":"SIGIR","first-page":"165","article-title":"Neural graph collaborative filtering","author":"Wang","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0085","series-title":"SIGIR","first-page":"1001","article-title":"Disentangled graph collaborative filtering","author":"Wang","year":"2020"},{"key":"10.1016\/j.eswa.2025.130813_bib0086","series-title":"Proceedings of the ACM web conference 2023","first-page":"790","article-title":"Multi-modal self-supervised learning for recommendation","author":"Wei","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0087","series-title":"WWW","first-page":"790","article-title":"Multi-modal self-supervised learning for recommendation","author":"Wei","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0088","series-title":"ACM Multimedia","first-page":"3541","article-title":"Graph-refined convolutional network for multimedia recommendation with implicit feedback","author":"Wei","year":"2020"},{"key":"10.1016\/j.eswa.2025.130813_bib0089","series-title":"ACM Multimedia","first-page":"1437","article-title":"MMGCN: Multi-modal graph convolution network for personalized recommendation of micro-video","author":"Wei","year":"2019"},{"key":"10.1016\/j.eswa.2025.130813_bib0090","doi-asserted-by":"crossref","unstructured":"Wu, C., Wu, F., Qi, T., Zhang, C., Huang, Y., & Xu, T. (2022). Mm-rec: Visiolinguistic model empowered multimodal news recommendation. In E. Amig\u00f3, P. Castells, J. Gonzalo, B. Carterette, J.S. Culpepper, & G. Kazai (Eds.), SIGIR \u201922: The 45th international ACM SIGIR conference on research and development in information retrieval, madrid, spain, july 11, - 15, 2022 (pp. 2560\u20132564). ACM. 10.1145\/3477495.3531896.","DOI":"10.1145\/3477495.3531896"},{"key":"10.1016\/j.eswa.2025.130813_bib0091","doi-asserted-by":"crossref","unstructured":"Wu, F., Qiao, Y., Chen, J., Wu, C., Qi, T., Lian, J., Liu, D., Xie, X., Gao, J., Wu, W., & Zhou, M. (2020). MIND: A large-scale dataset for news recommendation. In D. Jurafsky, J. Chai, N. Schluter, & J.R. Tetreault (Eds.), Proceedings of the 58th annual meeting of the association for computational linguistics, ACL 2020, online, july 5\u201310, 2020 (pp. 3597\u20133606). Association for Computational Linguistics. 10.18653\/V1\/2020.ACL-MAIN.331.","DOI":"10.18653\/v1\/2020.acl-main.331"},{"key":"10.1016\/j.eswa.2025.130813_bib0092","series-title":"SIGIR","first-page":"726","article-title":"Self-supervised graph learning for recommendation","author":"Wu","year":"2021"},{"issue":"5","key":"10.1016\/j.eswa.2025.130813_bib0093","doi-asserted-by":"crossref","first-page":"60","DOI":"10.1007\/s11280-024-01291-2","article-title":"A survey on large language models for recommendation","volume":"27","author":"Wu","year":"2024","journal-title":"World Wide Web (WWW)"},{"key":"10.1016\/j.eswa.2025.130813_bib0094","series-title":"SIGIR","first-page":"1830","article-title":"COHESION: Composite graph convolutional network with dual-stage fusion for multimodal recommendation","author":"Xu","year":"2025"},{"key":"10.1016\/j.eswa.2025.130813_bib0095","series-title":"AAAI","first-page":"12908","article-title":"MENTOR: Multi-level self-supervised learning for multimodal recommendation","author":"Xu","year":"2025"},{"issue":"3","key":"10.1016\/j.eswa.2025.130813_bib0096","doi-asserted-by":"crossref","first-page":"768","DOI":"10.1109\/TCSS.2020.2986778","article-title":"AMNN: Attention-based multimodal neural network model for hashtag recommendation","volume":"7","author":"Yang","year":"2020","journal-title":"IEEE Trans. Comput. Soc. Syst."},{"key":"10.1016\/j.eswa.2025.130813_bib0097","series-title":"AAAI","first-page":"287","article-title":"Learning to match on graph for fashion compatibility modeling","author":"Yang","year":"2020"},{"key":"10.1016\/j.eswa.2025.130813_bib0098","doi-asserted-by":"crossref","unstructured":"Ye, Y., Fu, J., Song, Y., Zheng, K., & Jose, J. M. (2025). Are multimodal embeddings truly beneficial for recommendation? A deep dive into whole vs. individual modalities. CoRR, abs\/2508.07399.","DOI":"10.1007\/978-3-032-21324-2_5"},{"key":"10.1016\/j.eswa.2025.130813_bib0099","doi-asserted-by":"crossref","first-page":"1067","DOI":"10.1109\/TMM.2021.3111487","article-title":"Multi-modal variational graph auto-encoder for recommendation systems","volume":"24","author":"Yi","year":"2022","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.eswa.2025.130813_bib0100","article-title":"Large multi-modal encoders for recommendation","volume":"abs\/2310.20343","author":"Yi","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.eswa.2025.130813_bib0101","doi-asserted-by":"crossref","unstructured":"Yi, Z., Long, Z., Ounis, I., Macdonald, C., & McCreadie, R. (2025). Enhancing recommender systems: Deep modality alignment with large multi-modal encoders. ACM Trans. Recomm. Syst., 3(4). 10.1145\/3718099.","DOI":"10.1145\/3718099"},{"key":"10.1016\/j.eswa.2025.130813_bib0102","series-title":"SIGIR","first-page":"1807","article-title":"Multi-modal graph contrastive learning for micro-video recommendation","author":"Yi","year":"2022"},{"key":"10.1016\/j.eswa.2025.130813_bib0103","series-title":"KDD","first-page":"974","article-title":"Graph convolutional neural networks for web-scale recommender systems","author":"Ying","year":"2018"},{"key":"10.1016\/j.eswa.2025.130813_bib0104","doi-asserted-by":"crossref","unstructured":"Yu, A., & Grauman, K. (2014). Fine-grained visual comparisons with local learning. In 2014\u202fIEEE conference on computer vision and pattern recognition, CVPR 2014, columbus, oh, usa, june 23\u201328, 2014 (pp. 192\u2013199). IEEE Computer Society. 10.1109\/CVPR.2014.32.","DOI":"10.1109\/CVPR.2014.32"},{"key":"10.1016\/j.eswa.2025.130813_bib0105","doi-asserted-by":"crossref","unstructured":"Yu, A., & Grauman, K. (2017). Semantic jitter: Dense supervision for visual comparisons via synthetic images. In IEEE international conference on computer vision, ICCV 2017, venice, italy, october 22\u201329, 2017 (pp. 5571\u20135580). IEEE Computer Society. 10.1109\/ICCV.2017.594.","DOI":"10.1109\/ICCV.2017.594"},{"key":"10.1016\/j.eswa.2025.130813_bib0106","series-title":"ACM Multimedia","first-page":"6576","article-title":"Multi-view graph convolutional network for multimedia recommendation","author":"Yu","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0107","series-title":"AAAI","first-page":"13096","article-title":"Mind individual information! principal graph learning for multimedia recommendation","author":"Yu","year":"2025"},{"key":"10.1016\/j.eswa.2025.130813_bib0108","doi-asserted-by":"crossref","first-page":"819","DOI":"10.1109\/TMM.2021.3059514","article-title":"A3FKG: Attentive attribute-aware fashion knowledge graph for outfit preference prediction","volume":"24","author":"Zhan","year":"2022","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.eswa.2025.130813_bib0109","series-title":"ACM Multimedia","first-page":"3872","article-title":"Mining latent structures for multimedia recommendation","author":"Zhang","year":"2021"},{"key":"10.1016\/j.eswa.2025.130813_bib0110","series-title":"IJCAI","first-page":"3420","article-title":"Hashtag recommendation for multimodal microblog using co-attention network","author":"Zhang","year":"2017"},{"issue":"1","key":"10.1016\/j.eswa.2025.130813_bib0111","first-page":"5:1","article-title":"Deep learning based recommender system: A survey and new perspectives","volume":"52","author":"Zhang","year":"2019","journal-title":"ACM Computing Surveys"},{"key":"10.1016\/j.eswa.2025.130813_bib0112","article-title":"Does multimodality improve recommender systems as expected? a critical analysis and future directions","volume":"abs\/2508.05377","author":"Zhou","year":"2025","journal-title":"CoRR"},{"key":"10.1016\/j.eswa.2025.130813_bib0113","article-title":"A comprehensive survey on multimodal recommender systems: Taxonomy, evaluation, and future directions","volume":"abs\/2302.04473","author":"Zhou","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.eswa.2025.130813_bib0114","series-title":"ECAI","first-page":"3123","article-title":"Enhancing dyadic relations with homogeneous graphs for multimodal recommendation","volume":"vol. 372","author":"Zhou","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0115","series-title":"Mmasia (workshops)","first-page":"6:1","article-title":"Mmrec: Simplifying multimodal recommendation","author":"Zhou","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0116","article-title":"A tale of two graphs: Freezing and denoising graph structures for multimodal recommendation","volume":"abs\/2211.06924","author":"Zhou","year":"2022","journal-title":"CoRR"},{"key":"10.1016\/j.eswa.2025.130813_bib0117","series-title":"ACM Multimedia","first-page":"935","article-title":"A tale of two graphs: Freezing and denoising graph structures for multimodal recommendation","author":"Zhou","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0118","series-title":"WWW","first-page":"845","article-title":"Bootstrap latent representations for multi-modal recommendation","author":"Zhou","year":"2023"},{"key":"10.1016\/j.eswa.2025.130813_bib0119","series-title":"SIGIR","first-page":"2912","article-title":"BARS: Towards open benchmarking for recommender systems","author":"Zhu","year":"2022"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417425044288?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417425044288?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T12:21:49Z","timestamp":1774441309000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417425044288"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":119,"alternative-id":["S0957417425044288"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2025.130813","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Large-scale benchmarks for multimodal recommendation with Ducho","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2025.130813","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 The Authors. Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"130813"}}