{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T13:17:49Z","timestamp":1775913469167,"version":"3.50.1"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031790287","type":"print"},{"value":"9783031790294","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-79029-4_32","type":"book-chapter","created":{"date-parts":[[2025,1,29]],"date-time":"2025-01-29T22:25:11Z","timestamp":1738189511000},"page":"460-474","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Evaluating Large Language Models for\u00a0Tax Law Reasoning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4160-6495","authenticated-orcid":false,"given":"Jo\u00e3o Paulo Cavalcante","family":"Presa","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2553-8790","authenticated-orcid":false,"given":"Celso Gon\u00e7alves","family":"Camilo Junior","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1203-5246","authenticated-orcid":false,"given":"S\u00e1vio Salvarino Teles de","family":"Oliveira","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,30]]},"reference":[{"key":"32_CR1","unstructured":"Qwen2 blog (2024). https:\/\/qwenlm.github.io\/blog\/qwen2\/. Accessed 08 June 2024"},{"issue":"1","key":"32_CR2","doi-asserted-by":"publisher","first-page":"127","DOI":"10.1186\/s40537-023-00802-8","volume":"10","author":"A Abdallah","year":"2023","unstructured":"Abdallah, A., Piryani, B., Jatowt, A.: Exploring the state of the art in legal qa systems. J. Big Data 10(1), 127 (2023)","journal-title":"J. Big Data"},{"key":"32_CR3","unstructured":"AI@Meta: Llama 3 model card (2024). https:\/\/github.com\/meta-llama\/llama3\/tree\/main"},{"key":"32_CR4","doi-asserted-by":"crossref","unstructured":"Ainslie, J., Lee-Thorp, J., de\u00a0Jong, M., Zemlyanskiy, Y., Lebron, F., Sanghai, S.: Gqa: training generalized multi-query transformer models from multi-head checkpoints. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 4895\u20134901 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"32_CR5","unstructured":"Cui, J., Li, Z., Yan, Y., Chen, B., Yuan, L.: Chatlaw: open-source legal large language model with integrated external knowledge bases. arXiv preprint arXiv:2306.16092 (2023)"},{"key":"32_CR6","unstructured":"Dai, Y., et al.: Laiw: a Chinese legal large language models benchmark (a technical report). arXiv preprint arXiv:2310.05620 (2023)"},{"key":"32_CR7","unstructured":"Du, Y., Wei, F., Zhang, H.: Anytool: self-reflective, hierarchical agents for large-scale api calls. arXiv preprint arXiv:2402.04253 (2024)"},{"key":"32_CR8","doi-asserted-by":"crossref","unstructured":"Fei, Z., et al.: Lawbench: benchmarking legal knowledge of large language models. arXiv preprint arXiv:2309.16289 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.452"},{"key":"32_CR9","unstructured":"General Coordination of Taxation (Cosit): Questions and answers for legal entities (2023). https:\/\/www.gov.br\/receitafederal\/pt-br\/assuntos\/orientacao-tributaria\/declaracoes-e-demonstrativos\/ecf\/perguntas-e-respostas-pj-2023.pdf. Accessed 11 Nov 2023"},{"key":"32_CR10","doi-asserted-by":"crossref","unstructured":"Hackl, V., M\u00fcller, A.E., Granitzer, M., Sailer, M.: Is gpt-4 a reliable rater? evaluating consistency in gpt-4\u2019s text ratings. In: Frontiers in Education. vol.\u00a08, p. 1272229. Frontiers Media SA (2023)","DOI":"10.3389\/feduc.2023.1272229"},{"issue":"5","key":"32_CR11","doi-asserted-by":"publisher","DOI":"10.1088\/1361-6579\/ab86d6","volume":"41","author":"S Haghayegh","year":"2020","unstructured":"Haghayegh, S., Kang, H.A., Khoshnevis, S., Smolensky, M.H., Diller, K.R.: A comprehensive guideline for bland-altman and intra class correlation calculations to properly compare two methods of measurement and interpret findings. Physiol. Meas. 41(5), 055012 (2020)","journal-title":"Physiol. Meas."},{"key":"32_CR12","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7b (2023). arXiv preprint arXiv:2310.06825"},{"key":"32_CR13","unstructured":"Jiang, A.Q., et\u00a0al.: Mixtral of experts (2024). arXiv preprint arXiv:2401.04088"},{"key":"32_CR14","doi-asserted-by":"crossref","unstructured":"Joshi, M., Choi, E., Weld, D.S., Zettlemoyer, L.: Triviaqa: a large scale distantly supervised challenge dataset for reading comprehension. In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics, vol. 1: Long Papers, pp. 1601\u20131611 (2017)","DOI":"10.18653\/v1\/P17-1147"},{"key":"32_CR15","unstructured":"Kim, D., et al.: sdpo: don\u2019t use your data all at once (2024)"},{"key":"32_CR16","doi-asserted-by":"crossref","unstructured":"Kim, D., et al.: Solar 10.7b: scaling large language models with simple yet effective depth up-scaling (2023)","DOI":"10.18653\/v1\/2024.naacl-industry.3"},{"key":"32_CR17","doi-asserted-by":"crossref","unstructured":"Koutcheme, C., Dainese, N., Sarsa, S., Hellas, A., Leinonen, J., Denny, P.: Open source language models can provide feedback: evaluating llms\u2019 ability to help students using gpt-4-as-a-judge. arXiv preprint arXiv:2405.05253 (2024)","DOI":"10.1145\/3649217.3653612"},{"key":"32_CR18","unstructured":"Lee, A.N., Hunter, C.J., Ruiz, N.: Platypus: quick, cheap, and powerful refinement of llms (2023)"},{"key":"32_CR19","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"32_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Y., Iter, D., Xu, Y., Wang, S., Xu, R., Zhu, C.: G-eval: Nlg evaluation using gpt-4 with better human alignment. In: The 2023 Conference on Empirical Methods in Natural Language Processing (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"32_CR21","doi-asserted-by":"crossref","unstructured":"Louis, A., van Dijck, G., Spanakis, G.: Interpretable long-form legal question answering with retrieval-augmented large language models. arXiv preprint arXiv:2309.17050 (2023)","DOI":"10.1609\/aaai.v38i20.30232"},{"key":"32_CR22","unstructured":"Ma, S., Chen, C., Chu, Q., Mao, J.: Leveraging large language models for relevance judgments in legal case retrieval. arXiv preprint arXiv:2403.18405 (2024)"},{"key":"32_CR23","unstructured":"Martin, L., Whitehouse, N., Yiu, S., Catterson, L., Perera, R.: Better call gpt, comparing large language models against lawyers. arXiv preprint arXiv:2401.16212 (2024)"},{"key":"32_CR24","unstructured":"Mistral.ai: Introducing the mixtral-8x22b-instruct-v0.1 model (2024). https:\/\/mistral.ai\/news\/mixtral-8x22b\/. Accessed 15 May 2024"},{"key":"32_CR25","unstructured":"Niklaus, J., et al.: Flawn-t5: an empirical examination of effective instruction-tuning data mixtures for legal reasoning. arXiv preprint arXiv:2404.02127 (2024)"},{"key":"32_CR26","unstructured":"OpenAI: Gpt-4 technical report (2023). https:\/\/cdn.openai.com\/papers\/gpt-4.pdf. Accessed 10 Jan 2024"},{"key":"32_CR27","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"32_CR28","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., Liang, P.: Squad: 100,000+ questions for machine comprehension of text. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, pp. 2383\u20132392 (2016)","DOI":"10.18653\/v1\/D16-1264"},{"key":"32_CR29","unstructured":"Lai, J., Gan, W., Wu, J., Qi, Z., Philip, S.Y.: Large language models in law: a survey (2023). https:\/\/arxiv.org\/abs\/2312.03718"},{"key":"32_CR30","unstructured":"Shazeer, N.: Glu variants improve transformer. arXiv preprint arXiv:2002.05202 (2020)"},{"key":"32_CR31","doi-asserted-by":"crossref","unstructured":"Sottana, A., Liang, B., Zou, K., Yuan, Z.: Evaluation metrics in the era of gpt-4: reliably evaluating large language models on sequence to sequence tasks. In: The 2023 Conference on Empirical Methods in Natural Language Processing (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.543"},{"key":"32_CR32","unstructured":"Team, I.: Openhermes-2-5-mistral-7b (2024). https:\/\/github.com\/inferless\/OpenHermes-2-5-Mistral-7B. Accessed 15 May 2024"},{"key":"32_CR33","unstructured":"Touvron, H., A.: Llama 2: open foundation and fine-tuned chat models (2023)"},{"key":"32_CR34","unstructured":"Wang, G., Cheng, S., Zhan, X., Li, X., Song, S., Liu, Y.: Openchat: advancing open-source language models with mixed-quality data. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"32_CR35","unstructured":"Wei, F., Chen, X., Luo, L.: Rethinking generative large language model evaluation for semantic comprehension. arXiv e-prints pp. arXiv\u20132403 (2024)"},{"key":"32_CR36","unstructured":"Xu, C., et al.: Wizardlm: empowering large language models to follow complex instructions. arXiv preprint arXiv:2304.12244 (2023)"},{"key":"32_CR37","doi-asserted-by":"crossref","unstructured":"Yue, L., et al.: Fedjudge: federated legal large language model. arXiv preprint arXiv:2309.08173 (2023)","DOI":"10.1007\/978-981-97-5569-1_17"},{"key":"32_CR38","unstructured":"Yue, S., et\u00a0al.: Disc-lawllm: fine-tuning large language models for intelligent legal services. arXiv preprint arXiv:2309.11325 (2023)"},{"key":"32_CR39","unstructured":"Zhang, R., et al.: Evaluation ethics of llms in legal domain. arXiv preprint arXiv:2403.11152 (2024)"},{"key":"32_CR40","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: Bertscore: evaluating text generation with bert (2020)"},{"key":"32_CR41","unstructured":"Zheng, L., et\u00a0al.: Judging llm-as-a-judge with mt-bench and chatbot arena. In: Thirty-Seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2023)"}],"container-title":["Lecture Notes in Computer Science","Intelligent Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-79029-4_32","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,29]],"date-time":"2025-01-29T22:25:23Z","timestamp":1738189523000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-79029-4_32"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031790287","9783031790294"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-79029-4_32","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"30 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"BRACIS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Brazilian Conference on Intelligent Systems","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bel\u00e9m do Par\u00e1","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Brazil","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"34","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"bracis2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}