{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T03:34:07Z","timestamp":1777952047937,"version":"3.51.4"},"reference-count":25,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"vor","delay-in-days":30,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Procedia Computer Science"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1016\/j.procs.2026.01.060","type":"journal-article","created":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T19:30:19Z","timestamp":1774035019000},"page":"513-522","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["LLM as a Judge for Evaluating Contract Graphs: Multi-Judge Benchmarking and Agentic Uncertainty-Aware Refinement"],"prefix":"10.1016","volume":"275","author":[{"given":"Moriya","family":"Dechtiar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daniel Martin","family":"Katz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sylvain","family":"Jaume","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongming","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.procs.2026.01.060_bib1","doi-asserted-by":"crossref","unstructured":"Liu, Y., Iter, D., Xu, Y., Wang, S., Xu, R., & Zhu, C. (2023). G-Eval: NLG evaluation using GPT-4 with better human alignment. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (pp. 2511\u20132522). Association for Computational Linguistics, Singapore. doi: 10.18653\/v1\/2023.emnlp-main.153.","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"10.1016\/j.procs.2026.01.060_bib2","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng","year":"2023","journal-title":"Advances in neural information processing systems"},{"key":"10.1016\/j.procs.2026.01.060_bib3","doi-asserted-by":"crossref","unstructured":"Yu, G., Sivakumar, M., Belle, A. B., Ghari, S., Wang, S., & Lethbridge, T. C. (2025). LLMs as Judges: Toward The Automatic Review of GSN-compliant Assurance Cases. arXiv preprint arXiv:2511.02203.","DOI":"10.2139\/ssrn.5912869"},{"key":"10.1016\/j.procs.2026.01.060_bib4","unstructured":"Wang, Y., Song, Y., Zhu, T., Zhang, X., Yu, Z., Chen, H.,... & Zhang, S. (2025). TrustJudge: Inconsistencies of LLM-as-a-Judge and How to Alleviate Them. arXiv preprint arXiv:2509.21117."},{"key":"10.1016\/j.procs.2026.01.060_bib5","doi-asserted-by":"crossref","unstructured":"Sheng, H., Liu, X., He, H., Zhao, J., & Kang, J. (2025). Analyzing Uncertainty of LLM-as-a-Judge: Interval Evaluations with Conformal Prediction. arXiv preprint arXiv:2509.18658.","DOI":"10.18653\/v1\/2025.emnlp-main.569"},{"key":"10.1016\/j.procs.2026.01.060_bib6","doi-asserted-by":"crossref","unstructured":"Dechtiar, M., Katz, D. M., Sundaresan, M., Jaume, S., & Wang, H. (2025). GRAPH-GRPO-LEX: Contract Graph Modeling and Reinforcement Learning with Group Relative Policy Optimization. Available at SSRN 5566538.","DOI":"10.2139\/ssrn.5566538"},{"key":"10.1016\/j.procs.2026.01.060_bib7","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., & Zhu, W. J. (2002, July). Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics (pp. 311-318).","DOI":"10.3115\/1073083.1073135"},{"issue":"1","key":"10.1016\/j.procs.2026.01.060_bib8","doi-asserted-by":"crossref","first-page":"25","DOI":"10.14778\/1687627.1687631","article-title":"Comparing stars: On approximating graph edit distance","volume":"2","author":"Zeng","year":"2009","journal-title":"Proceedings of the VLDB Endowment"},{"issue":"2","key":"10.1016\/j.procs.2026.01.060_bib9","doi-asserted-by":"crossref","first-page":"494","DOI":"10.1109\/TNNLS.2021.3070843","article-title":"A survey on knowledge graphs: Representation, acquisition, and applications","volume":"33","author":"Ji","year":"2021","journal-title":"IEEE transactions on neural networks and learning systems"},{"key":"10.1016\/j.procs.2026.01.060_bib10","series-title":"Beyond accuracy: Behavioral testing of NLP models with CheckList. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (pp. 4902\u20134912)","author":"Ribeiro","year":"2020"},{"key":"10.1016\/j.procs.2026.01.060_bib11","unstructured":"Chan, C. M., Chen, W., Su, Y., Yu, J., Xue, W., Zhang, S.,... & Liu, Z. (2023). Chateval: Towards better llm-based evaluators through multi-agent debate. arXiv preprint arXiv:2308.07201."},{"key":"10.1016\/j.procs.2026.01.060_bib12","series-title":"On calibration of modern neural networks. In International conference on machine learning (pp. 1321-1330)","author":"Guo","year":"2017"},{"key":"10.1016\/j.procs.2026.01.060_bib13","unstructured":"Lakshminarayanan, B., Pritzel, A., & Blundell, C. (2017). Simple and scalable predictive uncertainty estimation using deep ensembles. Advances in neural information processing systems, 30."},{"key":"10.1016\/j.procs.2026.01.060_bib14","doi-asserted-by":"crossref","unstructured":"Lee, D., Hwang, Y., Kim, Y., Park, J., & Jung, K. (2024). Are llm-judges robust to expressions of uncertainty? investigating the effect of epistemic markers on llm-based evaluation. arXiv preprint arXiv:2410.20774.","DOI":"10.18653\/v1\/2025.naacl-long.452"},{"key":"10.1016\/j.procs.2026.01.060_bib15","first-page":"46534","article-title":"Self-refine: Iterative refinement with self-feedback","volume":"36","author":"Madaan","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.procs.2026.01.060_bib16","first-page":"8634","article-title":"Reflexion: Language agents with verbal reinforcement learning","volume":"36","author":"Shinn","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.procs.2026.01.060_bib17","unstructured":"Bai, Y., Kadavath, S., Kundu, S., Askell, A., Kernion, J., Jones, A.,... & Kaplan, J. (2022). Constitutional ai: Harmlessness from ai feedback. arXiv preprint arXiv:2212.08073."},{"key":"10.1016\/j.procs.2026.01.060_bib18","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"10.1016\/j.procs.2026.01.060_bib19","unstructured":"Dong, Z., Peng, B., Wang, Y., Fu, J., Wang, X., Shan, Y., & Zhou, X. (2024). Effiqa: Efficient question-answering with strategic multi-model collaboration on knowledge graphs. arXiv preprint arXiv:2406.01238."},{"key":"10.1016\/j.procs.2026.01.060_bib20","unstructured":"Kadavath, S., Conerly, T., Askell, A., Henighan, T., Drain, D., Perez, E.,... & Kaplan, J. (2022). Language models (mostly) know what they know. arXiv preprint arXiv:2207.05221."},{"key":"10.1016\/j.procs.2026.01.060_bib21","doi-asserted-by":"crossref","first-page":"172","DOI":"10.18653\/v1\/2021.nllp-1.18","article-title":"Named Entity Recognition in Historic Legal Text: A Transformer and State Machine Ensemble Method","author":"Trias","year":"2021","journal-title":"in Proceedings of the Natural Legal Language Processing Workshop, EMNLP 2021"},{"key":"10.1016\/j.procs.2026.01.060_bib22","doi-asserted-by":"crossref","unstructured":"Chalkidis, I., Jana, A., Hartung, D., Bommarito, M., Androutsopoulos, I., Katz, D. M., & Aletras, N. (2021). LexGLUE: A benchmark dataset for legal language understanding in English. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (pp.4310-4330).","DOI":"10.18653\/v1\/2022.acl-long.297"},{"key":"10.1016\/j.procs.2026.01.060_bib23","first-page":"44123","article-title":"Legalbench: A collaboratively built benchmark for measuring legal reasoning in large language models","volume":"36","author":"Guha","year":"2023","journal-title":"Advances in neural information processing systems"},{"key":"10.1016\/j.procs.2026.01.060_bib24","unstructured":"Manor, L., & Li, J. J. (2019). Plain English summarization of contracts. arXiv preprint arXiv:1906.00424."},{"key":"10.1016\/j.procs.2026.01.060_bib25","unstructured":"Wang, W., Wei, F., Dong, L., Bao, H., Yang, N., & Zhou, M. (2020). MiniLM: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. Advances in Neural Information Processing Systems, 33."}],"container-title":["Procedia Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1877050926000608?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1877050926000608?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T11:21:39Z","timestamp":1777893699000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1877050926000608"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":25,"alternative-id":["S1877050926000608"],"URL":"https:\/\/doi.org\/10.1016\/j.procs.2026.01.060","relation":{},"ISSN":["1877-0509"],"issn-type":[{"value":"1877-0509","type":"print"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"LLM as a Judge for Evaluating Contract Graphs: Multi-Judge Benchmarking and Agentic Uncertainty-Aware Refinement","name":"articletitle","label":"Article Title"},{"value":"Procedia Computer Science","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.procs.2026.01.060","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Author(s). Published by Elsevier B.V.","name":"copyright","label":"Copyright"}]}}