{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T10:12:24Z","timestamp":1775038344114,"version":"3.50.1"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031887161","type":"print"},{"value":"9783031887178","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88717-8_5","type":"book-chapter","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T12:08:33Z","timestamp":1743768513000},"page":"48-63","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["A Reproducibility Study on\u00a0Consistent LLM Reasoning for\u00a0Natural Language Inference over\u00a0Clinical Trials"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3682-4960","authenticated-orcid":false,"given":"Artur","family":"Guimar\u00e3es","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6290-5719","authenticated-orcid":false,"given":"Jo\u00e3o","family":"Magalh\u00e3es","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3856-2936","authenticated-orcid":false,"given":"Bruno","family":"Martins","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,3]]},"reference":[{"key":"5_CR1","doi-asserted-by":"crossref","unstructured":"Alkaissi, H., McFarlane, S.I.: Artificial hallucinations in chatgpt: implications in scientific writing. Cureus 15(2) (2023)","DOI":"10.7759\/cureus.35179"},{"issue":"12","key":"5_CR2","doi-asserted-by":"publisher","first-page":"1860","DOI":"10.1200\/JCO.2005.03.8976","volume":"24","author":"NE Avis","year":"2006","unstructured":"Avis, N.E., Smith, K.W., Link, C.L., Hortobagyi, G.N., Rivera, E.: Factors associated with participation in breast cancer treatment clinical trials. J. Clin. Oncol. 24(12), 1860\u20131867 (2006)","journal-title":"J. Clin. Oncol."},{"key":"5_CR3","unstructured":"Bernsohn, D., et al.: LegalLens: leveraging LLMs for legal violation identification in unstructured text. In: Graham, Y., Purver, M. (eds.) Proceedings of the Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), St. Julian\u2019s, Malta, pp. 2129\u20132145. Association for Computational Linguistics (2024). https:\/\/aclanthology.org\/2024.eacl-long.130"},{"key":"5_CR4","unstructured":"Chowdhery, A., et al.: Palm: scaling language modeling with pathways (2022)"},{"key":"5_CR5","unstructured":"Chung, H.W., et al.: Scaling instruction-finetuned language models. arXiv:2210.11416 (2022)"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Danilevsky, M., Qian, K., Aharonov, R., Katsis, Y., Kawas, B., Sen, P.: A survey of the state of explainable AI for natural language processing. arXiv:2010.00711 (2020)","DOI":"10.18653\/v1\/2020.aacl-main.46"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Deng, C., Zhao, Y., Tang, X., Gerstein, M., Cohan, A.: Investigating data contamination in modern benchmarks for large language models. arXiv:2311.09783 (2023)","DOI":"10.18653\/v1\/2024.naacl-long.482"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"DeYoung, J., Lehman, E., Nye, B., Marshall, I.J., Wallace, B.C.: Evidence inference 2.0: more data, better models. arXiv:2005.04177 (2020)","DOI":"10.18653\/v1\/2020.bionlp-1.13"},{"key":"5_CR9","unstructured":"Dubey, A., et al.: The llama 3 herd of models. arXiv:2407.21783 (2024)"},{"key":"5_CR10","doi-asserted-by":"publisher","unstructured":"Gema, A., Hong, G., Minervini, P., Daines, L., Alex, B.: Edinburgh clinical NLP at SemEval-2024 task 2: fine-tune your model unless you have access to GPT-4. In: Ojha, A.K., Do\u011fru\u00f6z, A.S., Tayyar\u00a0Madabushi, H., Da\u00a0San\u00a0Martino, G., Rosenthal, S., Ros\u00e1, A. (eds.) Proceedings of the International Workshop on Semantic Evaluation, Mexico City, Mexico, pp. 1894\u20131904. Association for Computational Linguistics (2024). https:\/\/doi.org\/10.18653\/v1\/2024.semeval-1.265. https:\/\/aclanthology.org\/2024.semeval-1.265","DOI":"10.18653\/v1\/2024.semeval-1.265"},{"key":"5_CR11","unstructured":"Golchin, S., Surdeanu, M.: Time travel in LLMs: tracing data contamination in large language models. arXiv:2308.08493 (2023)"},{"key":"5_CR12","doi-asserted-by":"publisher","unstructured":"Guimar\u00e3es, A., Martins, B., Magalh\u00e3es, J.: Lisbon computational linguists at SemEval-2024 task 2: using a mistral-7B model and data augmentation. In: Ojha, A.K., Do\u011fru\u00f6z, A.S., Tayyar\u00a0Madabushi, H., Da\u00a0San\u00a0Martino, G., Rosenthal, S., Ros\u00e1, A. (eds.) Proceedings of the International Workshop on Semantic Evaluation, Mexico City, Mexico, pp. 1280\u20131287. Association for Computational Linguistics (2024). https:\/\/doi.org\/10.18653\/v1\/2024.semeval-1.185. https:\/\/aclanthology.org\/2024.semeval-1.185","DOI":"10.18653\/v1\/2024.semeval-1.185"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"He, X., Wu, Y., Camburu, O.M., Minervini, P., Stenetorp, P.: Using natural language explanations to improve robustness of in-context learning for natural language inference. arXiv:2311.07556 (2023)","DOI":"10.18653\/v1\/2024.acl-long.728"},{"key":"5_CR14","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7b. arXiv:2310.06825 (2023)"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Jin, Q., et al.: Matching patients to clinical trials with large language models. arXiv:2307.15051 (2023)","DOI":"10.1038\/s41467-024-53081-z"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Jullien, M., Valentino, M., Freitas, A.: SemEval-2024 task 2: safe biomedical natural language inference for clinical trials. In: Proceedings of the International Workshop on Semantic Evaluation. Association for Computational Linguistics (2024)","DOI":"10.18653\/v1\/2024.semeval-1.271"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Jullien, M., Valentino, M., Frost, H., O\u2019Regan, P., Landers, D., Freitas, A.: Nli4ct: multi-evidence natural language inference for clinical trial reports. arXiv:2305.03598 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.1041"},{"key":"5_CR18","unstructured":"Kim, D., et al.: Chatgpt (large language model) (2023). https:\/\/chat.openai.com\/chat"},{"key":"5_CR19","first-page":"22199","volume":"35","author":"T Kojima","year":"2022","unstructured":"Kojima, T., Gu, S.S., Reid, M., Matsuo, Y., Iwasawa, Y.: Large language models are zero-shot reasoners. Adv. Neural. Inf. Process. Syst. 35, 22199\u201322213 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Labrak, Y., Bazoge, A., Morin, E., Gourraud, P.A., Rouvier, M., Dufour, R.: Biomistral: a collection of open-source pretrained large language models for medical domains. arXiv:2402.10373 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.348"},{"key":"5_CR21","doi-asserted-by":"publisher","unstructured":"Lee, L.H., Chiou, C.Y., Lin, T.M.: NYCU-NLP at SemEval-2024 task 2: aggregating large language models in biomedical natural language inference for clinical trials. In: Ojha, A.K., Do\u011fru\u00f6z, A.S., Tayyar\u00a0Madabushi, H., Da\u00a0San\u00a0Martino, G., Rosenthal, S., Ros\u00e1, A. (eds.) Proceedings of the International Workshop on Semantic Evaluation, Mexico City, Mexico, pp. 1455\u20131462. Association for Computational Linguistics (2024). https:\/\/doi.org\/10.18653\/v1\/2024.semeval-1.209. https:\/\/aclanthology.org\/2024.semeval-1.209","DOI":"10.18653\/v1\/2024.semeval-1.209"},{"key":"5_CR22","unstructured":"Li, Y., Du, M., Song, R., Wang, X., Wang, Y.: A survey on fairness in large language models. arXiv:2308.10149 (2023)"},{"key":"5_CR23","doi-asserted-by":"publisher","unstructured":"Liu, J., Thoma, S.: FZI-WIM at SemEval-2024 task 2: self-consistent CoT for complex NLI in biomedical domain. In: Ojha, A.K., Do\u011fru\u00f6z, A.S., Tayyar\u00a0Madabushi, H., Da\u00a0San\u00a0Martino, G., Rosenthal, S., Ros\u00e1, A. (eds.) Proceedings of the International Workshop on Semantic Evaluation, Mexico City, Mexico, pp. 1269\u20131279. Association for Computational Linguistics (2024). https:\/\/doi.org\/10.18653\/v1\/2024.semeval-1.184. https:\/\/aclanthology.org\/2024.semeval-1.184","DOI":"10.18653\/v1\/2024.semeval-1.184"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Magar, I., Schwartz, R.: Data contamination: from memorization to exploitation. arXiv:2203.08242 (2022)","DOI":"10.18653\/v1\/2022.acl-short.18"},{"key":"5_CR25","unstructured":"Ojha, A.K., Do\u011fru\u00f6z, A.S., Tayyar\u00a0Madabushi, H., Da\u00a0San\u00a0Martino, G., Rosenthal, S., Ros\u00e1, A. (eds.): Proceedings of the International Workshop on Semantic Evaluation, Mexico City, Mexico. Association for Computational Linguistics (2024). https:\/\/aclanthology.org\/2024.semeval-1.0"},{"key":"5_CR26","unstructured":"Achiam, J., et al.: GPT-4 technical report (2024)"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Roberts, K., Demner-Fushman, D., Voorhees, E.M., Bedrick, S., Hersh, W.R.: Overview of the TREC 2022 clinical trials track. In: TREC (2022)","DOI":"10.6028\/NIST.SP.500-338.trials-overview"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Romanov, A., Shivade, C.: Lessons from natural language inference in the clinical domain. arXiv:1808.06752 (2018)","DOI":"10.18653\/v1\/D18-1187"},{"key":"5_CR29","doi-asserted-by":"crossref","unstructured":"Soboroff, I.: Overview of TREC 2021. In: TREC (2021)","DOI":"10.6028\/NIST.SP.500-335.overview-overview"},{"key":"5_CR30","unstructured":"Team, G., et al.: Gemini: a family of highly capable multimodal models. arXiv:2312.11805 (2023)"},{"key":"5_CR31","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models (2023)"},{"key":"5_CR32","doi-asserted-by":"crossref","unstructured":"Wan, Y., Pu, G., Sun, J., Garimella, A., Chang, K.W., Peng, N.: \u201cKelly is a warm person, joseph is a role model\u201d: gender biases in LLM-generated reference letters. arXiv:2310.09219 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.243"},{"key":"5_CR33","unstructured":"Wang, G., Cheng, S., Zhan, X., Li, X., Song, S., Liu, Y.: Openchat: advancing open-source language models with mixed-quality data (2024)"},{"key":"5_CR34","unstructured":"Wang, X., Zhu, W., Saxon, M., Steyvers, M., Wang, W.Y.: Large language models are latent variable models: explaining and finding good demonstrations for in-context learning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"5_CR35","unstructured":"Wang, X., et al.: Self-consistency improves chain of thought reasoning in language models. arXiv:2203.11171 (2022)"},{"key":"5_CR36","unstructured":"Wei, J., et al.: Finetuned language models are zero-shot learners. arXiv:2109.01652 (2022)"},{"key":"5_CR37","unstructured":"Wei, J., et al.: Emergent abilities of large language models. arXiv:2206.07682 (2022)"},{"key":"5_CR38","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR39","unstructured":"Wong, C., et al.: Scaling clinical trial matching using large language models: a case study in oncology. In: Machine Learning for Healthcare Conference, pp. 846\u2013862. PMLR (2023)"},{"key":"5_CR40","doi-asserted-by":"crossref","unstructured":"Xie, Q., Schenck, E.J., Yang, H.S., Chen, Y., Peng, Y., Wang, F.: Faithful AI in medicine: a systematic review with large language models and beyond. medRxiv (2023)","DOI":"10.21203\/rs.3.rs-3661764\/v1"},{"issue":"2","key":"5_CR41","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3639372","volume":"15","author":"H Zhao","year":"2024","unstructured":"Zhao, H., et al.: Explainability for large language models: a survey. ACM Trans. Intell. Syst. Technol. 15(2), 1\u201338 (2024)","journal-title":"ACM Trans. Intell. Syst. Technol."}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88717-8_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T09:23:18Z","timestamp":1746696198000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88717-8_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031887161","9783031887178"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88717-8_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"3 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that\u00a0are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lucca","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 April 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"47","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2025.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}