{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T09:39:15Z","timestamp":1780393155529,"version":"3.54.1"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2020,3,18]],"date-time":"2020-03-18T00:00:00Z","timestamp":1584489600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,3,18]],"date-time":"2020-03-18T00:00:00Z","timestamp":1584489600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Inf Retrieval J"],"published-print":{"date-parts":[[2020,8]]},"DOI":"10.1007\/s10791-020-09371-3","type":"journal-article","created":{"date-parts":[[2020,3,18]],"date-time":"2020-03-18T19:02:52Z","timestamp":1584558172000},"page":"387-410","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":45,"title":["Offline evaluation options for recommender systems"],"prefix":"10.1007","volume":"23","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2278-0445","authenticated-orcid":false,"given":"Roc\u00edo","family":"Ca\u00f1amares","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0668-6317","authenticated-orcid":false,"given":"Pablo","family":"Castells","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6638-0232","authenticated-orcid":false,"given":"Alistair","family":"Moffat","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2020,3,18]]},"reference":[{"issue":"6","key":"9371_CR1","doi-asserted-by":"publisher","first-page":"734","DOI":"10.1109\/TKDE.2005.99","volume":"17","author":"G Adomavicius","year":"2005","unstructured":"Adomavicius, G., & Tuzhilin, A. (2005). Toward the next generation of recommender systems: A survey of the state-of-the-art and possible extensions. IEEE Transactions on Knowledge and Data Engineering, 17(6), 734\u2013749.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"9371_CR2","doi-asserted-by":"crossref","unstructured":"Armstrong, T.G., Moffat, A., Webber, W., & Zobel, J. (2009). Improvements that don\u2019t add up: Ad-hoc retrieval results since 1998. In Proceedings of the ACM international conference on information and knowledge management (CIKM) (pp. 601\u2013610).","DOI":"10.1145\/1645953.1646031"},{"key":"9371_CR3","doi-asserted-by":"crossref","unstructured":"Bailey, P., Craswell, N., Soboroff, I., Thomas, P., de Vries, A.P., & Yilmaz, E. (2008). Relevance assessment: Are judges exchangeable and does it matter. In Proceedings of the ACM international conference on research and development in information retrieval (SIGIR) (pp. 667\u2013674).","DOI":"10.1145\/1390334.1390447"},{"issue":"6","key":"9371_CR4","doi-asserted-by":"publisher","first-page":"606","DOI":"10.1007\/s10791-017-9312-z","volume":"20","author":"A Bellog\u00edn","year":"2017","unstructured":"Bellog\u00edn, A., Castells, P., & Cantador, I. (2017). Statistical biases in information retrieval metrics for recommender systems. Information Retrieval, 20(6), 606\u2013634.","journal-title":"Information Retrieval"},{"issue":"6","key":"9371_CR5","doi-asserted-by":"publisher","first-page":"697","DOI":"10.1007\/s10791-012-9214-z","volume":"16","author":"A Bellog\u00edn","year":"2013","unstructured":"Bellog\u00edn, A., Wang, J., & Castells, P. (2013). Bridging memory-based collaborative filtering and text retrieval. Information Retrieval, 16(6), 697\u2013724.","journal-title":"Information Retrieval"},{"key":"9371_CR6","doi-asserted-by":"crossref","unstructured":"Bellogn, A., Castells, P., & Cantador, I. (2011). Precision-oriented evaluation of recommender systems: An algorithmic comparison. In Proceedings of the ACM conference on recommender systems (RecSys) (pp. 333\u2013336).","DOI":"10.1145\/2043932.2043996"},{"key":"9371_CR7","unstructured":"Bertin-Mahieux, T., Ellis D.P.W., Whitman, B., & Lamere, P. (2011). The million song dataset. In Proceedings of the international society for music information retrieval conference (ISMIR) (pp. 591\u2013596)."},{"key":"9371_CR8","doi-asserted-by":"crossref","unstructured":"Ca\u00f1amares, R., & Castells, P. (2017). A probabilistic reformulation of memory-based collaborative filtering: Implications on popularity biases. In Proceedings of the ACM international conference on research and development in information retrieval (SIGIR) (pp. 215\u2013224).","DOI":"10.1145\/3077136.3080836"},{"key":"9371_CR9","unstructured":"Ca\u00f1amares, R., & Castells, P. (2018a). Characterization of fair experiments for recommender system evaluation: A formal analysis. In RecSys wrkshop on offline evaluation of recommender systems (REVEAL)."},{"key":"9371_CR10","doi-asserted-by":"crossref","unstructured":"Ca\u00f1amares, R., & Castells, P. (2018b). Should I follow the crowd? A probabilistic analysis of the effectiveness of popularity in recommender systems. In Proceedings of the ACM international conference on research and development in information retrieval (SIGIR) (pp. 415\u2013424).","DOI":"10.1145\/3209978.3210014"},{"issue":"1","key":"9371_CR11","doi-asserted-by":"publisher","first-page":"4:1","DOI":"10.1145\/2094072.2094076","volume":"30","author":"BA Carterette","year":"2012","unstructured":"Carterette, B. A. (2012). Multiple testing in statistical analysis of systems-based information retrieval experiments. ACM Transactions on Information Systems, 30(1), 4:1\u20134:34.","journal-title":"ACM Transactions on Information Systems"},{"key":"9371_CR12","doi-asserted-by":"crossref","unstructured":"Castells, P., Hurley, N.J., & Vargas, S. (2015). Novelty and diversity in recommender systems. In Recommender systems handbook (pp. 881\u2013918). Springer.","DOI":"10.1007\/978-1-4899-7637-6_26"},{"key":"9371_CR13","doi-asserted-by":"crossref","unstructured":"Chapelle, O., Metlzer, D., Zhang, Y., & Grinspan, P. (2009). Expected reciprocal rank for graded relevance. In Proceedings of the ACM international conference on information and knowledge management (CIKM) (pp. 621\u2013630).","DOI":"10.1145\/1645953.1646033"},{"key":"9371_CR14","doi-asserted-by":"crossref","unstructured":"Cremonesi, P., Koren, Y., & Turrin, R. (2010). Performance of recommender algorithms on top-$$n$$ recommendation tasks. In Proceedings of the ACM conference on recommender systems (RecSys) (pp. 39\u201346).","DOI":"10.1145\/1864708.1864721"},{"key":"9371_CR15","unstructured":"Dacrema, MF., Cremonesi, P., & Jannach, D. (2019) . Are we really making much progress? A worrying analysis of recent neural recommendation approaches. In Proceedings of the ACM conference on recommender systems (RecSys) (pp. 101\u2013109)."},{"key":"9371_CR16","doi-asserted-by":"crossref","unstructured":"Din\u00e7er, BT., Macdonald, C., & Ounis, I . (2014a) . Hypothesis testing for the risk-sensitive evaluation of retrieval systems. In Proceedings of the ACM international conference on research and development in information retrieval (SIGIR) (pp. 23\u201332).","DOI":"10.1145\/2600428.2609625"},{"key":"9371_CR17","doi-asserted-by":"crossref","unstructured":"Din\u00e7er, BT., Ounis, I., & Macdonald, C .(2014b) . Tackling biased baselines in the risk-sensitive evaluation of retrieval systems. In Proceedings of the European conference on information retrieval (ECIR) (pp. 26\u201338).","DOI":"10.1007\/978-3-319-06028-6_3"},{"issue":"1","key":"9371_CR18","first-page":"96","volume":"7","author":"N Ferro","year":"2018","unstructured":"Ferro, N., Fuhr, N., Grefenstette, G., Konstan, J. A., Castells, P., Daly, E. M., et al. (2018). From evaluating to forecasting performance: How to turn Information retrieval, natural language processing and recommender systems into predictive sciences. Dagstuhl Manifestos, 7(1), 96\u2013139.","journal-title":"Dagstuhl Manifestos"},{"key":"9371_CR19","doi-asserted-by":"crossref","unstructured":"Gilotte, A., Calauz\u00e8nes, C., Nedelec, T., Abraham, A., & Doll, S . (2018) . Offline A\/B testing for recommender systems. In Proceedings of the ACM international conference on web earch and data mining (WSDM) (pp. 198\u2013206).","DOI":"10.1145\/3159652.3159687"},{"key":"9371_CR20","doi-asserted-by":"crossref","unstructured":"Gruson, A., Chandar, P., Charbuillet, C., McInerney, J., Hansen, S., Tardieu, D., & Carterette, B .(2019). Offline evaluation to make decisions about playlist recommendation. In Proceedings of the ACM international conference on web search and data mining (WSDM) (pp. 420\u2013428).","DOI":"10.1145\/3289600.3291027"},{"key":"9371_CR21","doi-asserted-by":"crossref","unstructured":"Gunawardana, A., & Shani, G. (2015) . Evaluating recommendation systems. In Recommender systems handbook (pp. 265\u2013308). Springer.","DOI":"10.1007\/978-1-4899-7637-6_8"},{"key":"9371_CR22","first-page":"21","volume-title":"TREC: Experiment and evaluation in information retrieval, chap 2","author":"DK Harman","year":"2005","unstructured":"Harman, D. K. (2005). The TREC test collections. In E. M. Voorhees & D. K. Harman (Eds.), TREC: Experiment and evaluation in information retrieval, chap 2 (pp. 21\u201352). Cambridge: MIT Press."},{"issue":"4","key":"9371_CR23","doi-asserted-by":"publisher","first-page":"19.1","DOI":"10.1145\/2827872","volume":"5","author":"FM Harper","year":"2016","unstructured":"Harper, F. M., & Konstan, J. A. (2016). The MovieLens datasets: History and context. ACM Transactions on Interactive Intelligent Systems, 5(4), 19.1\u201319.19.","journal-title":"ACM Transactions on Interactive Intelligent Systems"},{"key":"9371_CR24","doi-asserted-by":"crossref","unstructured":"He, R., & McAuley, J . (2016) . Ups and downs: Modeling the visual evolution of fashion trends with one-class collaborative filtering. In Proceedings of the international conference on the world wide web (WWW) (pp. 507\u2013517).","DOI":"10.1145\/2872427.2883037"},{"issue":"1","key":"9371_CR25","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1145\/963770.963772","volume":"22","author":"JL Herlocker","year":"2004","unstructured":"Herlocker, J. L., Konstan, J. A., Terveen, L. G., & Riedl, J. T. (2004). Evaluating collaborative filtering recommender systems. ACM Transactions on Information Systems, 22(1), 5\u201353.","journal-title":"ACM Transactions on Information Systems"},{"issue":"1","key":"9371_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/1500000051","volume":"10","author":"K Hofmann","year":"2016","unstructured":"Hofmann, K., Li, L., & Radlinski, F. (2016). Online evaluation for information retrieval. Foundations & Trends in Information Retrieval, 10(1), 1\u2013117.","journal-title":"Foundations & Trends in Information Retrieval"},{"key":"9371_CR27","doi-asserted-by":"crossref","unstructured":"Hu, Y., Koren, Y., & Volinsky, C . (2008). Collaborative filtering for implicit feedback datasets. In Proceedings of the international conference on data mining (ICDM) (pp. 15\u201319).","DOI":"10.1109\/ICDM.2008.22"},{"issue":"5","key":"9371_CR28","doi-asserted-by":"publisher","first-page":"427","DOI":"10.1007\/s11257-015-9165-3","volume":"25","author":"D Jannach","year":"2015","unstructured":"Jannach, D., Lerche, L., Kamehkhosh, I., & Jugovac, M. (2015). What recommenders recommend: An analysis of recommendation biases and possible countermeasures. User Modeling and User-Adapted Interaction, 25(5), 427\u2013491.","journal-title":"User Modeling and User-Adapted Interaction"},{"issue":"4","key":"9371_CR29","doi-asserted-by":"publisher","first-page":"422","DOI":"10.1145\/582415.582418","volume":"20","author":"K J\u00e4rvelin","year":"2002","unstructured":"J\u00e4rvelin, K., & Kek\u00e4l\u00e4inen, J. (2002). Cumulated gain-based evaluation of IR techniques. ACM Transactions on Information Systems, 20(4), 422\u2013446.","journal-title":"ACM Transactions on Information Systems"},{"issue":"2","key":"9371_CR30","doi-asserted-by":"publisher","first-page":"138","DOI":"10.1007\/s10791-012-9205-0","volume":"16","author":"G Kazai","year":"2013","unstructured":"Kazai, G., Kamps, J., & Milic-Frayling, N. (2013). An analysis of human factors and label accuracy in crowdsourcing relevance judgments. Information Retrieval, 16(2), 138\u2013178.","journal-title":"Information Retrieval"},{"key":"9371_CR31","doi-asserted-by":"crossref","unstructured":"Kutlu, M., McDonnell, T., Barkallah, Y., Elsayed, T., & Lease, M. (2018). Crowd versus expert: What can relevance judgment rationales teach us about assessor disagreement? In Proceedins of the ACM international conference on research and development in information retrieval (SIGIR) (pp. 805\u2013814).","DOI":"10.1145\/3209978.3210033"},{"issue":"4","key":"9371_CR32","doi-asserted-by":"publisher","first-page":"416","DOI":"10.1007\/s10791-016-9282-6","volume":"19","author":"X Lu","year":"2016","unstructured":"Lu, X., Moffat, A., & Culpepper, J. S. (2016). The effect of pooling and evaluation depth on IR metrics. Information Retrieval, 19(4), 416\u2013445.","journal-title":"Information Retrieval"},{"key":"9371_CR33","doi-asserted-by":"crossref","unstructured":"Marlin, B.M., & Zemel, R.S. (2009) .Collaborative prediction and ranking with non-random missing data. In Proceedings of the ACM conference on recommender systems (RecSys) (pp. 5\u201312).","DOI":"10.1145\/1639714.1639717"},{"issue":"3","key":"9371_CR34","doi-asserted-by":"publisher","first-page":"24:1","DOI":"10.1145\/3052768","volume":"35","author":"A Moffat","year":"2017","unstructured":"Moffat, A., Bailey, P., Scholer, F., & Thomas, P. (2017). Incorporating user expectations and behavior into the measurement of search effectiveness. ACM Transactions on Information Systems, 35(3), 24:1\u201324:38.","journal-title":"ACM Transactions on Information Systems"},{"issue":"1","key":"9371_CR35","doi-asserted-by":"publisher","first-page":"2.1","DOI":"10.1145\/1416950.1416952","volume":"27","author":"A Moffat","year":"2008","unstructured":"Moffat, A., & Zobel, J. (2008). Rank-biased precision for measurement of retrieval effectiveness. ACM Transactions on Information Systems, 27(1), 2.1\u20132.27.","journal-title":"ACM Transactions on Information Systems"},{"key":"9371_CR36","doi-asserted-by":"crossref","unstructured":"Ning, X., Desrosiers, C., & Karypis, G .(2015) . A comprehensive survey of neighborhood-based recommendation methods. In Recommender systems handbook (pp. 37\u201376). Springer.","DOI":"10.1007\/978-1-4899-7637-6_2"},{"key":"9371_CR37","doi-asserted-by":"crossref","unstructured":"Ricci, F., Rokach, L., & Shapira, B. (2015). Recommender systems: Introduction and challenges. In Recommender systems handbook (pp 1\u201334). Springer.","DOI":"10.1007\/978-1-4899-7637-6_1"},{"key":"9371_CR38","doi-asserted-by":"crossref","unstructured":"Robertson, S. (2006). On GMAP: And other transformations. In Proceedings of the ACM international conference on information and knowledge management (CIKM) (pp. 78\u201383).","DOI":"10.1145\/1183614.1183630"},{"key":"9371_CR39","doi-asserted-by":"crossref","unstructured":"Said, A., & Bellog\u00edn, A. (2014). Comparative recommender system evaluation: Benchmarking recommendation frameworks. In Proceedings of the ACM conference on recommender systems (RecSys) (pp. 129\u2013136).","DOI":"10.1145\/2645710.2645746"},{"key":"9371_CR40","doi-asserted-by":"crossref","unstructured":"Sakai, T. (2016). Statistical significance, power, and sample sizes: A systematic review of SIGIR and TOIS, 2006\u20132015. In Proceedings of the ACM international conference on research and development in information retrieval (SIGIR) (pp. 5\u201314).","DOI":"10.1145\/2911451.2911492"},{"key":"9371_CR41","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-13-1199-4","volume-title":"Laboratory experiments in information retrieval: sample sizes, effect sizes, and statistical power, the information retrieval series,","author":"T Sakai","year":"2018","unstructured":"Sakai, T. (2018). Laboratory experiments in information retrieval: sample sizes, effect sizes, and statistical power, the information retrieval series, (Vol. 40). Berlin: Springer."},{"issue":"5","key":"9371_CR42","doi-asserted-by":"publisher","first-page":"447","DOI":"10.1007\/s10791-008-9059-7","volume":"11","author":"T Sakai","year":"2008","unstructured":"Sakai, T., & Kando, N. (2008). On information retrieval metrics designed for evaluation with incomplete relevance assessments. Information Retrieval, 11(5), 447\u2013470.","journal-title":"Information Retrieval"},{"key":"9371_CR43","unstructured":"Schnabel, T., Swaminathan, A., Singh, A., Chandak, N., & Joachims, T. (2016). Recommendations as treatments: Debiasing learning and evaluation. In Proceedings of the international conference on machine learning (ICML) (pp. 1670\u20131679)."},{"key":"9371_CR44","doi-asserted-by":"crossref","unstructured":"Steck, H. (2010). Training and testing of recommender systems on data missing not at random. In Proceedings of the conference on knowledge discovery and data mining (KDD) (pp. 713\u2013722).","DOI":"10.1145\/1835804.1835895"},{"key":"9371_CR45","doi-asserted-by":"crossref","unstructured":"Steck, H. (2011). Item popularity and recommendation accuracy. In Proceedings of the ACM conference on recommender systems (RecSys) (pp. 125\u2013132).","DOI":"10.1145\/2043932.2043957"},{"key":"9371_CR46","doi-asserted-by":"crossref","unstructured":"Steck, H. (2013). Evaluation of recommendations: Rating-prediction and ranking. In Proceedings of the ACM conference on recommender systems (RecSys) (pp. 213\u2013220).","DOI":"10.1145\/2507157.2507160"},{"key":"9371_CR47","unstructured":"Swaminathan, A., Krishnamurthy, A., Agarwal, A., Dud\u00edk, M., Langford, J., Jose, D., & Zitouni, I. (2017). Off-policy evaluation for slate recommendation. In Proceedings of the conference on neural information processing systems (NIPS) (pp. 3635\u20133645)."},{"key":"9371_CR48","doi-asserted-by":"crossref","unstructured":"Valcarce, D., Bellogn, A., Parapar, J., & Castells, P. (2018). On the robustness and discriminative power of IR metrics for top-$$n$$ recommendation. In Proceedings of the ACM conference on recommender systems (RecSys) (pp. 260\u2013268).","DOI":"10.1145\/3240323.3240347"},{"issue":"sup1","key":"9371_CR49","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1080\/00031305.2019.1583913","volume":"73","author":"RL Wasserstein","year":"2019","unstructured":"Wasserstein, R. L., Schirm, A. L., & Lazar, N. A. (2019). Moving to a world beyond \u201c$$p<0.05$$\u201d. The American Statistician, 73(sup1), 1\u201319.","journal-title":"The American Statistician"},{"key":"9371_CR50","doi-asserted-by":"crossref","unstructured":"Webber, W., Moffat, A., & Zobel, J. (2008). Score standardization for inter-collection comparison of retrieval systems. In Proceedings of the ACM international conference on research and development in information retrieval (SIGIR) (pp. 51\u201358).","DOI":"10.1145\/1390334.1390346"},{"issue":"4","key":"9371_CR51","doi-asserted-by":"publisher","first-page":"20.1","DOI":"10.1145\/1852102.1852106","volume":"28","author":"W Webber","year":"2010","unstructured":"Webber, W., Moffat, A., & Zobel, J. (2010). A similarity measure for indefinite rankings. ACM Transactions on Information Systems, 28(4), 20.1\u201320.38.","journal-title":"ACM Transactions on Information Systems"},{"key":"9371_CR52","doi-asserted-by":"crossref","unstructured":"Yang, L., Cui, Y., Xuan, Y., Wang, C., Belongie, S., & Estrin, D. (2018). Unbiased offline recommender evaluation for missing-not-at-random implicit feedback. In Proceedins of the ACM conference on recommender systems (RecSys) (pp. 279\u2013287).","DOI":"10.1145\/3240323.3240355"},{"key":"9371_CR53","doi-asserted-by":"crossref","unstructured":"Yilmaz, E., Aslam, J.A., & Robertson, S. (2008). A new rank correlation coefficient for information retrieval. In Proceedings of the ACM international conference on research and development in information retrieval (SIGIR) (pp. 587\u2013594).","DOI":"10.1145\/1390334.1390435"}],"container-title":["Information Retrieval Journal"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10791-020-09371-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10791-020-09371-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10791-020-09371-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,2]],"date-time":"2024-01-02T14:04:35Z","timestamp":1704204275000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10791-020-09371-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3,18]]},"references-count":53,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2020,8]]}},"alternative-id":["9371"],"URL":"https:\/\/doi.org\/10.1007\/s10791-020-09371-3","relation":{},"ISSN":["1386-4564","1573-7659"],"issn-type":[{"value":"1386-4564","type":"print"},{"value":"1573-7659","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,3,18]]},"assertion":[{"value":"11 September 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 March 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 March 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}