{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T04:10:32Z","timestamp":1743826232851,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031887079","type":"print"},{"value":"9783031887086","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88708-6_10","type":"book-chapter","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:52:28Z","timestamp":1743767548000},"page":"149-163","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Retrieve, Annotate, Evaluate, Repeat: Leveraging Multimodal LLMs for\u00a0Large-Scale Product Retrieval Evaluation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4396-6019","authenticated-orcid":false,"given":"Kasra","family":"Hosseini","sequence":"first","affiliation":[]},{"given":"Thomas","family":"Kober","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6329-4280","authenticated-orcid":false,"given":"Josip","family":"Krapac","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4658-0381","authenticated-orcid":false,"given":"Roland","family":"Vollgraf","sequence":"additional","affiliation":[]},{"given":"Weiwei","family":"Cheng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8922-9643","authenticated-orcid":false,"given":"Ana","family":"Peleteiro Ramallo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,3]]},"reference":[{"key":"10_CR1","unstructured":"Achiam, J., et\u00a0al.: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"issue":"6","key":"10_CR2","doi-asserted-by":"crossref","first-page":"1053","DOI":"10.1016\/j.ipm.2012.01.004","volume":"48","author":"O Alonso","year":"2012","unstructured":"Alonso, O., Mizzaro, S.: Using crowdsourcing for trec relevance assessment. Inf. Process. Manag. 48(6), 1053\u20131066 (2012)","journal-title":"Inf. Process. Manag."},{"key":"10_CR3","unstructured":"Bergum, J.K.: Improving retrieval with llm-as-a-judge (2024). https:\/\/blog.vespa.ai\/improving-retrieval-with-llm-as-a-judge\/. Accessed 10 June 2024"},{"key":"10_CR4","doi-asserted-by":"crossref","unstructured":"Blanco, R., et al.: Repeatable and reliable search system evaluation using crowdsourcing. In: Proceedings of the 34th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 923\u2013932 (2011)","DOI":"10.1145\/2009916.2010039"},{"key":"10_CR5","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR6","doi-asserted-by":"crossref","unstructured":"Chen, K.T., Alonso, O., Larson, M., King, I.: Introduction to the special issue on crowd in intelligent systems (2016)","DOI":"10.1145\/2920522"},{"key":"10_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"128","DOI":"10.1007\/978-3-030-99736-6_9","volume-title":"Advances in Information Retrieval","author":"Y Chen","year":"2022","unstructured":"Chen, Y., Liu, S., Liu, Z., Sun, W., Baltrunas, L., Schroeder, B.: WANDS: dataset for product search relevance assessment. In: Hagen, M., et al. (eds.) ECIR 2022. LNCS, vol. 13185, pp. 128\u2013141. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-030-99736-6_9"},{"key":"10_CR8","doi-asserted-by":"publisher","unstructured":"Faggioli, G., et al.: Perspectives on large language models for relevance judgment. In: Proceedings of the 2023 ACM SIGIR International Conference on Theory of Information Retrieval, ICTIR \u201923, pp. 39\u201350. Association for Computing Machinery, New York (2023). https:\/\/doi.org\/10.1145\/3578337.3605136","DOI":"10.1145\/3578337.3605136"},{"key":"10_CR9","doi-asserted-by":"crossref","unstructured":"Faggioli, G., et al.: Who determines what is relevant? Humans or ai? Why not both? Commun. ACM 67(4), 31\u201334 (2024)","DOI":"10.1145\/3624730"},{"issue":"2","key":"10_CR10","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3110217","volume":"36","author":"M Ferrante","year":"2017","unstructured":"Ferrante, M., Ferro, N., Maistro, M.: Aware: exploiting evaluation measures to combine multiple assessors. ACM Trans. Inf. Syst. (TOIS) 36(2), 1\u201338 (2017)","journal-title":"ACM Trans. Inf. Syst. (TOIS)"},{"key":"10_CR11","doi-asserted-by":"publisher","unstructured":"Halvey, M., Villa, R., Clough, P.D.: Sigir 2014: workshop on gathering efficient assessments of relevance (gear). SIGIR Forum 49(1), 16\u201319 (2015). https:\/\/doi.org\/10.1145\/2795403.2795409","DOI":"10.1145\/2795403.2795409"},{"key":"10_CR12","doi-asserted-by":"crossref","first-page":"140","DOI":"10.1007\/s10618-008-0114-1","volume":"18","author":"R Kohavi","year":"2009","unstructured":"Kohavi, R., Longbotham, R., Sommerfield, D., Henne, R.M.: Controlled experiments on the web: survey and practical guide. Data Min. Knowl. Disc. 18, 140\u2013181 (2009)","journal-title":"Data Min. Knowl. Disc."},{"key":"10_CR13","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1007\/s10791-013-9222-7","volume":"16","author":"M Lease","year":"2013","unstructured":"Lease, M., Yilmaz, E.: Crowdsourcing for information retrieval: introduction to the special issue. Inf. Retr. 16, 91\u2013100 (2013)","journal-title":"Inf. Retr."},{"key":"10_CR14","doi-asserted-by":"publisher","unstructured":"MacAvaney, S., Soldaini, L.: One-shot labeling for automatic relevance estimation. In: Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR \u201923, pp. 2230\u20132235. Association for Computing Machinery, New York (2023).https:\/\/doi.org\/10.1145\/3539618.3592032","DOI":"10.1145\/3539618.3592032"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"Marcus, A., Parameswaran, A., et\u00a0al.: Crowdsourced data management: industry and academic perspectives. Found. Trends\u00ae Databases 6(1-2), 1\u2013161 (2015)","DOI":"10.1561\/1900000044"},{"key":"10_CR16","doi-asserted-by":"crossref","unstructured":"Moe, W.W.: Buying, searching, or browsing: differentiating between online shoppers using in-store navigational clickstream. J. Consum. Psychol. 13(1\u20132), 29\u201339 (2003). https:\/\/doi.org\/10.1207\/S15327663JCP13-1&2_03","DOI":"10.1207\/S15327663JCP13-1&2_03"},{"key":"10_CR17","unstructured":"Nye, M., et\u00a0al.: Show your work: scratchpads for intermediate computation with language models. arXiv preprint arXiv:2112.00114 (2021)"},{"key":"10_CR18","unstructured":"OpenAI: Hello gpt-4o (2024). https:\/\/openai.com\/index\/hello-gpt-4o\/. Accessed 17 July 2024"},{"issue":"8","key":"10_CR19","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Rahmani, H.A., Craswell, N., Yilmaz, E., Mitra, B., Campos, D.: Synthetic test collections for retrieval evaluation (2024)","DOI":"10.1145\/3626772.3657942"},{"key":"10_CR21","doi-asserted-by":"crossref","unstructured":"Sanderson, M., et\u00a0al.: Test collection based evaluation of information retrieval systems. Found. Trends\u00ae Inf. Retr. 4(4), 247\u2013375 (2010)","DOI":"10.1561\/1500000009"},{"key":"10_CR22","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-031-56066-8_1","volume-title":"Advances in Information Retrieval","author":"B Soviero","year":"2024","unstructured":"Soviero, B., Kuhn, D., Salle, A., Moreira, V.P.: Chatgpt goes shopping: LLMs can predict relevance in ecommerce search. In: Goharian, N., et al. (eds.) Advances in Information Retrieval, pp. 3\u201311. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-56066-8_1"},{"key":"10_CR23","unstructured":"Spark-Jones, K.: Report on the need for and provision of an\u2019ideal\u2019information retrieval test collection. Computer Laboratory (1975)"},{"key":"10_CR24","doi-asserted-by":"publisher","unstructured":"Sun, W., et al.: Is ChatGPT good at search? investigating large language models as re-ranking agents. In: Bouamor, H., Pino, J., Bali, K. (eds.) Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 14918\u201314937. Association for Computational Linguistics, Singapore (2023). https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.923. https:\/\/aclanthology.org\/2023.emnlp-main.923\/","DOI":"10.18653\/v1\/2023.emnlp-main.923"},{"key":"10_CR25","doi-asserted-by":"crossref","unstructured":"Thomas, P., Spielman, S., Craswell, N., Mitra, B.: Large language models can accurately predict searcher preferences. arXiv preprint arXiv:2309.10621 (2023)","DOI":"10.1145\/3626772.3657707"},{"key":"10_CR26","unstructured":"Upadhyay, S., Kamalloo, E., Lin, J.: LLMs can patch up missing relevance judgments in evaluation. arXiv preprint arXiv:2405.04727 (2024)"},{"key":"10_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"355","DOI":"10.1007\/3-540-45691-0_34","volume-title":"Evaluation of Cross-Language Information Retrieval Systems","author":"EM Voorhees","year":"2002","unstructured":"Voorhees, E.M.: The philosophy of information retrieval evaluation. In: Peters, C., Braschler, M., Gonzalo, J., Kluck, M. (eds.) CLEF 2001. LNCS, vol. 2406, pp. 355\u2013370. Springer, Heidelberg (2002). https:\/\/doi.org\/10.1007\/3-540-45691-0_34"},{"key":"10_CR28","doi-asserted-by":"publisher","unstructured":"Wang, H., Na, T.: Rethinking e-commerce search. SIGIR Forum 57(2) (2024). https:\/\/doi.org\/10.1145\/3642979.3643007","DOI":"10.1145\/3642979.3643007"},{"key":"10_CR29","doi-asserted-by":"publisher","unstructured":"Wang, P., et al.: Large language models are not fair evaluators. In: Ku, L.W., Martins, A., Srikumar, V. (eds.) Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, vol. 1: Long Papers, pp. 9440\u20139450. Association for Computational Linguistics, Bangkok (2024). https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.511. https:\/\/aclanthology.org\/2024.acl-long.511\/","DOI":"10.18653\/v1\/2024.acl-long.511"},{"key":"10_CR30","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88708-6_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:52:51Z","timestamp":1743767571000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88708-6_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031887079","9783031887086"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88708-6_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"3 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lucca","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 April 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"47","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2025.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}