{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T00:50:56Z","timestamp":1774399856418,"version":"3.50.1"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031887079","type":"print"},{"value":"9783031887086","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88708-6_19","type":"book-chapter","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:51:57Z","timestamp":1743767517000},"page":"293-309","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Context Example Selection for LLM Generated Relevance Assessments"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1509-9248","authenticated-orcid":false,"given":"Jack","family":"McKechnie","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1266-5996","authenticated-orcid":false,"given":"Graham","family":"McDonald","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3143-279X","authenticated-orcid":false,"given":"Craig","family":"Macdonald","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,3]]},"reference":[{"key":"19_CR1","unstructured":"Abbasiantaeb, Z., Meng, C., Azzopardi, L., Aliannejadi, M.: Can we use large language models to fill relevance judgment holes? arXiv preprint arXiv:2405.05600 (2024)"},{"key":"19_CR2","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"19_CR3","doi-asserted-by":"crossref","unstructured":"Arabzadeh, N., Vtyurina, A., Yan, X., Clarke, C.L.: Shallow pooling for sparse labels. Inf. Retrieval J. 25(4) (2022)","DOI":"10.1007\/s10791-022-09411-0"},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Aslam, J.A., Savell, R.: On the effectiveness of evaluating retrieval systems in the absence of relevance judgments. In: Proceedings of SIGIR (2003)","DOI":"10.1145\/860500.860501"},{"key":"19_CR5","unstructured":"Bajaj, P., et\u00a0al.: MS MARCO: A human generated machine reading comprehension dataset. arXiv preprint arXiv:1611.09268 (2016)"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Boytsov, L., Belova, A., Westfall, P.: Deciding on an adjustment for multiplicity in IR experiments. In: Proceedings of SIGIR (2013)","DOI":"10.1145\/2484028.2484034"},{"key":"19_CR7","unstructured":"Brants, T., Popat, A., Xu, P., Och, F.J., Dean, J.: Large language models in machine translation. In: Proceedings of EMNLP-CoNLL (2007)"},{"key":"19_CR8","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: Proceedings of NeurIPS (2020)"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Buckley, C., Dimmick, D., Soboroff, I., Voorhees, E.: Bias and the limits of pooling for large collections. Inf. Retrieval 10 (2007)","DOI":"10.1007\/s10791-007-9032-x"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Carterette, B., Gabrilovich, E., Josifovski, V., Metzler, D.: Measuring the reusability of test collections. In: Proceedings of WSDM (2010)","DOI":"10.1145\/1718487.1718516"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Cleverdon, C.: The Cranfield tests on index language devices. In: ASLIB. vol.\u00a019 (1967)","DOI":"10.1108\/eb050097"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Cohen, J.: A coefficient of agreement for nominal scales. EPM 20(1) (1960)","DOI":"10.1177\/001316446002000104"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Craswell, N., Mitra, B., Yilmaz, E., Campos, D., Voorhees, E.M.: Overview of the TREC 2019 deep learning track. arXiv preprint arXiv:2003.07820 (2020)","DOI":"10.6028\/NIST.SP.1266.deep-overview"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Craswell, N., Mitra, B., Yilmaz, E., Campos, D., Voorhees, E.M.: Overview of the TREC 2019 deep learning track. arXiv preprint arXiv:2003.07820 (2020)","DOI":"10.6028\/NIST.SP.1266.deep-overview"},{"key":"19_CR15","unstructured":"Devlin, J.: BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"19_CR16","unstructured":"Dong, L., et al.: Unified language model pre-training for natural language understanding and generation. In: Proceedings of NeurIPS (2019)"},{"key":"19_CR17","unstructured":"Dubey, A., et\u00a0al.: The Llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Dunn, O.J.: Multiple comparisons among means. JASA 56(293) (1961)","DOI":"10.2307\/2282330"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Ekstrand, M.D., McDonald, G., Raj, A., Johnson, I.: Overview of the TREC 2022 fair ranking track. arXiv preprint arXiv:2302.05558 (2023)","DOI":"10.6028\/NIST.SP.500-338.fair-overview"},{"key":"19_CR20","doi-asserted-by":"crossref","unstructured":"Faggioli, G., et\u00a0al.: Perspectives on large language models for relevance judgment. In: Proceedings of SIGIR (2023)","DOI":"10.1145\/3578337.3605136"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Fang, J., Meng, Z., Macdonald, C.: Trace the evidence: Constructing knowledge-grounded reasoning chains for retrieval-augmented generation. arXiv preprint arXiv:2406.11460 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.496"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Fisher, R.A.: Frequency distribution of the values of the correlation coefficient in samples from an indefinitely large population. Biometrika 10(4) (1915)","DOI":"10.2307\/2331838"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Gonen, H., Iyer, S., Blevins, T., Smith, N.A., Zettlemoyer, L.: Demystifying prompts in language models via perplexity estimation. arXiv preprint arXiv:2212.04037 (2022)","DOI":"10.18653\/v1\/2023.findings-emnlp.679"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Harman, D.: Overview of the first TREC conference. In: Proceedings of SIGIR (1993)","DOI":"10.3115\/1075671.1075685"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Harman, D.: Overview of the third text retrieval conference (TREC-3). DIANE Publishing (1995)","DOI":"10.6028\/NIST.SP.500-225"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Hauff, C., Hiemstra, D., Azzopardi, L., De\u00a0Jong, F.: A case for automatic system evaluation. In: Proceedings of ECIR (2010)","DOI":"10.1007\/978-3-642-12275-0_16"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Hawking, D., Craswell, N., Thistlewaite, P., Harman, D.: Results and challenges in web search evaluation. Comput. Netw. 31(11-16) (1999)","DOI":"10.1016\/S1389-1286(99)00024-9"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Hofst\u00e4tter, S., Lin, S.C., Yang, J.H., Lin, J., Hanbury, A.: Efficiently teaching an effective dense retriever with balanced topic aware sampling. In: Proceedings of SIGIR (2021)","DOI":"10.1145\/3404835.3462891"},{"key":"19_CR29","doi-asserted-by":"crossref","unstructured":"J\u00e4rvelin, K., Kek\u00e4l\u00e4inen, J.: Cumulated gain-based evaluation of IR techniques. TOIS 20(4) (2002)","DOI":"10.1145\/582415.582418"},{"key":"19_CR30","unstructured":"Jones, S.: Report on the need for and provision of an \u201cideal\u201d information retrieval test collection (1975)"},{"key":"19_CR31","unstructured":"Kendall, M.G.: Rank correlation methods (1948)"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Liu, J., Shen, D., Zhang, Y., Dolan, W.B., Carin, L., Chen, W.: What makes good in-context examples for GPT-3? In: Proceedings of DeeLIO@ACL (2022)","DOI":"10.18653\/v1\/2022.deelio-1.10"},{"key":"19_CR33","doi-asserted-by":"crossref","unstructured":"Losada, D.E., Parapar, J., Barreiro, A.: Cost-effective construction of information retrieval test collections. In: Proceedings of CERI (2018)","DOI":"10.1145\/3230599.3230612"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"MacAvaney, S., Macdonald, C., Ounis, I.: Streamlining evaluation with IR-measures. In: Proceedings of ECIR (2022)","DOI":"10.1007\/978-3-030-99739-7_38"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"MacAvaney, S., Soldaini, L.: One-shot labeling for automatic relevance estimation. In: Proceedings of SIGIR (2023)","DOI":"10.1145\/3539618.3592032"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"MacAvaney, S., Yates, A., Feldman, S., Downey, D., Cohan, A., Goharian, N.: Simplified data wrangling with IR_datasets. In: Proceedings of SIGIR (2021)","DOI":"10.1145\/3404835.3463254"},{"key":"19_CR37","doi-asserted-by":"crossref","unstructured":"Mackie, I., Dalton, J., Yates, A.: How deep is your learning: The DL-HARD annotated deep learning dataset. In: Proceedings of SIGIR (2021)","DOI":"10.1145\/3404835.3463262"},{"key":"19_CR38","unstructured":"Meng, C., Arabzadeh, N., Askari, A., Aliannejadi, M., de\u00a0Rijke, M.: Query performance prediction using relevance judgments generated by large language models. arXiv preprint arXiv:2404.01012 (2024)"},{"key":"19_CR39","doi-asserted-by":"crossref","unstructured":"Nuray, R., Can, F.: Automatic ranking of information retrieval systems using data fusion. Inf. Process. Manage. 42(3) (2006)","DOI":"10.1016\/j.ipm.2005.03.023"},{"key":"19_CR40","doi-asserted-by":"crossref","unstructured":"Pilault, J., Li, R., Subramanian, S., Pal, C.: On extractive and abstractive neural document summarization with transformer language models. In: Proceedings of EMNLP (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.748"},{"key":"19_CR41","doi-asserted-by":"crossref","unstructured":"Pradeep, R., Nogueira, R., Lin, J.: The expando-mono-duo design pattern for text ranking with pretrained sequence-to-sequence models. arXiv preprint arXiv:2101.05667 (2021)","DOI":"10.18653\/v1\/2020.findings-emnlp.63"},{"key":"19_CR42","unstructured":"Pradeep, R., et al.: Ragnar\u00f6k: A reusable RAG framework and baselines for TREC 2024 retrieval-augmented generation track. arXiv preprint arXiv:2406.16828 (2024)"},{"key":"19_CR43","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR 21(140) (2020)"},{"key":"19_CR44","doi-asserted-by":"crossref","unstructured":"Rahmani, H.A., et al.: LLM4Eval: large language model for evaluation in IR. In: Proceedings of SIGIR (2024)","DOI":"10.1145\/3626772.3657992"},{"key":"19_CR45","doi-asserted-by":"crossref","unstructured":"Rahmani, H.A., et al.: Report on the 1st workshop on large language model for evaluation in information retrieval (LLM4Eval 2024) at SIGIR 2024. arXiv preprint arXiv:2408.05388 (2024)","DOI":"10.1145\/3722449.3722461"},{"key":"19_CR46","unstructured":"Rahmani, H.A., et al.: LLMJudge: LLMs for relevance judgments. arXiv preprint arXiv:2408.08896 (2024)"},{"key":"19_CR47","doi-asserted-by":"crossref","unstructured":"Schweizer, B., Wolff, E.F.: On nonparametric measures of dependence for random variables. Ann. Stat. 9(4) (1981)","DOI":"10.1214\/aos\/1176345528"},{"key":"19_CR48","doi-asserted-by":"crossref","unstructured":"Soboroff, I.: Don\u2019t use LLMs to make relevance judgments. arXiv preprint arXiv:2409.15133 (2024)","DOI":"10.54195\/irrj.19625"},{"key":"19_CR49","doi-asserted-by":"crossref","unstructured":"Soboroff, I., Nicholas, C., Cahan, P.: Ranking retrieval systems without relevance judgments. In: Proceedings of SIGIR (2001)","DOI":"10.1145\/383952.383961"},{"key":"19_CR50","doi-asserted-by":"crossref","unstructured":"Sorensen, T., et al.: An information-theoretic approach to prompt engineering without ground truth labels. arXiv preprint arXiv:2203.11364 (2022)","DOI":"10.18653\/v1\/2022.acl-long.60"},{"key":"19_CR51","doi-asserted-by":"crossref","unstructured":"Spoerri, A.: Using the structure of overlap between search results to rank retrieval systems without relevance judgments. Inf. Process. Manag. 43(4) (2007)","DOI":"10.1016\/j.ipm.2006.09.009"},{"key":"19_CR52","doi-asserted-by":"crossref","unstructured":"Su, Y., Tai, Y., Ji, Y., Li, J., Yan, B., Zhang, M.: Demonstration augmentation for zero-shot in-context learning. arXiv preprint arXiv:2406.01224 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.846"},{"key":"19_CR53","unstructured":"Tamber, M.S., Pradeep, R., Lin, J.: Scaling down, LiTting up: Efficient zero-shot listwise reranking with seq2seq encoder-decoder models. arXiv preprint arXiv:2312.16098 (2023)"},{"key":"19_CR54","doi-asserted-by":"crossref","unstructured":"Thomas, P., Spielman, S., Craswell, N., Mitra, B.: Large language models can accurately predict searcher preferences. In: Proceedings of SIGIR (2024)","DOI":"10.1145\/3626772.3657707"},{"key":"19_CR55","unstructured":"Upadhyay, S., Kamalloo, E., Lin, J.: LLMs can patch up missing relevance judgments in evaluation. arXiv preprint arXiv:2405.04727 (2024)"},{"key":"19_CR56","unstructured":"Upadhyay, S., Pradeep, R., Thakur, N., Craswell, N., Lin, J.: UMBRELA: UMbrela is the (open-source reproduction of the) Bing RELevance assessor. arXiv preprint arXiv:2406.06519 (2024)"},{"key":"19_CR57","unstructured":"Vaswani, A.: Attention is all you need. In: Proceedings of NeurIPS (2017)"},{"key":"19_CR58","doi-asserted-by":"crossref","unstructured":"Voorhees, E.M.: The TREC robust retrieval track. In: ACM SIGIR Forum. vol.\u00a039 (2005)","DOI":"10.1145\/1067268.1067272"},{"key":"19_CR59","unstructured":"Voorhees, E.M., Soboroff, I., Lin, J.: Can old TREC collections reliably evaluate modern neural retrieval models? arXiv preprint arXiv:2201.11086 (2022)"},{"key":"19_CR60","doi-asserted-by":"crossref","unstructured":"Wu, S., Crestani, F.: Data fusion with estimated weights. In: Proceedings of CIKM (2002)","DOI":"10.1145\/584792.584908"},{"key":"19_CR61","doi-asserted-by":"crossref","unstructured":"Wu, S., Crestani, F.: Methods for ranking information retrieval systems without relevance judgments. In: Proceedings of ACM\/SIGAPP (2003)","DOI":"10.1145\/952686.952693"},{"key":"19_CR62","doi-asserted-by":"crossref","unstructured":"Zhuang, H., et al.: RankT5: fine-tuning t5 for text ranking with ranking losses. In: Proceedings of SIGIR (2023)","DOI":"10.1145\/3539618.3592047"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88708-6_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:52:45Z","timestamp":1743767565000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88708-6_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031887079","9783031887086"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88708-6_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"3 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lucca","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 April 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"47","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2025.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}