{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T10:40:02Z","timestamp":1755859202267,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","funder":[{"name":"PWC Research and Development Centre","award":["R5212ECS"],"award-info":[{"award-number":["R5212ECS"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730047","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:21:38Z","timestamp":1752456098000},"page":"1109-1119","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["OBELLA: Open the Book for Evaluating Long-Form Large Language Model Answers in Open-Domain Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5330-1979","authenticated-orcid":false,"given":"Tianyu","family":"Ren","sequence":"first","affiliation":[{"name":"Queen's University Belfast, Belfast, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4303-2806","authenticated-orcid":false,"given":"Zhaoyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Queen's University Belfast, Belfast, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2633-6015","authenticated-orcid":false,"given":"Hui","family":"Wang","sequence":"additional","affiliation":[{"name":"Queen's University Belfast, Belfast, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7443-7876","authenticated-orcid":false,"given":"Karen","family":"Rafferty","sequence":"additional","affiliation":[{"name":"Queen's University Belfast, Belfast, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E. Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E. Hinton. 2016. Layer Normalization. arxiv:1607.06450 [stat.ML] https:\/\/arxiv.org\/abs\/1607.06450"},{"key":"e_1_3_2_1_2_1","first-page":"65 05","volume-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. Association for Computational Linguistics","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization. Association for Computational Linguistics, Ann Arbor, Michigan, 65-72. https:\/\/aclanthology.org\/W05-0909"},{"key":"e_1_3_2_1_3_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arxiv:2005.14165 [cs.CL] https:\/\/arxiv.org\/abs\/2005.14165"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.20"},{"key":"e_1_3_2_1_5_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Camburu Oana-Maria","year":"2018","unstructured":"Oana-Maria Camburu, Tim Rockt\u00e4schel, Thomas Lukasiewicz, and Phil Blunsom. 2018. e-SNLI: Natural Language Inference with Natural Language Explanations. In Advances in Neural Information Processing Systems, Vol. 31. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2018\/file\/4c7a167bb329bd92580a99ce422d6fa6-Paper.pdf"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.528"},{"key":"e_1_3_2_1_7_1","volume-title":"CLEX: Continuous Length Extrapolation for Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=wXpSidPpc5","author":"Chen Guanzheng","year":"2024","unstructured":"Guanzheng Chen, Xin Li, Zaiqiao Meng, Shangsong Liang, and Lidong Bing. 2024a. CLEX: Continuous Length Extrapolation for Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=wXpSidPpc5"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.324"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.845"},{"key":"e_1_3_2_1_10_1","unstructured":"Ta-Chung Chi. 2024. Toward Length-Extrapolatable Transformers. Ph.D. Dissertation. Carnegie Mellon University."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1177\/001316446002000104"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19--1423"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.467"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Matteo Gabburo Siddhant Garg Rik Koncel-Kedziorski and Alessandro Moschitti. 2023b. SQUARE: Automatic Question Answering Evaluation using Multiple Positive and Negative References. In Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 2: Short Papers). Association for Computational Linguistics Nusa Dua Bali 20-28. https:\/\/aclanthology.org\/2023.ijcnlp-short.3","DOI":"10.18653\/v1\/2023.ijcnlp-short.3"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.3115\/1220175.1220289"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_17_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=nZeVKeeFYf9","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, yelong shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=nZeVKeeFYf9"},{"key":"e_1_3_2_1_18_1","unstructured":"Lei Huang Weijiang Yu Weitao Ma Weihong Zhong Zhangyin Feng Haotian Wang Qianglong Chen Weihua Peng Xiaocheng Feng Bing Qin and Ting Liu. 2023. A Survey on Hallucination in Large Language Models: Principles Taxonomy Challenges and Open Questions. arxiv:2311.05232 [cs.CL] https:\/\/arxiv.org\/abs\/2311.05232"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.74"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1147"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.307"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"key":"e_1_3_2_1_23_1","volume-title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. CoRR","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. CoRR, Vol. abs\/1909.11942 (2019). arxiv:1909.11942 http:\/\/arxiv.org\/abs\/1909.11942"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.772"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657778"},{"key":"e_1_3_2_1_26_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. Association for Computational Linguistics, Barcelona, Spain, 74-81. https:\/\/aclanthology.org\/W04--1013"},{"key":"e_1_3_2_1_27_1","unstructured":"AI @ Meta Llama Team. 2024. The Llama 3 Herd of Models. arxiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.466"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657855"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","unstructured":"Yixin Nie Haonan Chen and Mohit Bansal. 2019. Combining fact extraction and verification with neural semantic matching networks. In Proceedings of the Thirty-Third AAAI Conference on Artificial Intelligence and Thirty-First Innovative Applications of Artificial Intelligence Conference and Ninth AAAI Symposium on Educational Advances in Artificial Intelligence (Honolulu Hawaii USA) (AAAI'19\/IAAI'19\/EAAI'19). AAAI Press Article 842 8 pages. doi:10.1609\/aaai.v33i01.33016859","DOI":"10.1609\/aaai.v33i01.33016859"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.441"},{"key":"e_1_3_2_1_32_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023). https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/3600270.3602281"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_35_1","volume-title":"Test Long: Attention with Linear Biases Enables Input Length Extrapolation. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=R8sQPpGCv0","author":"Press Ofir","year":"2022","unstructured":"Ofir Press, Noah Smith, and Mike Lewis. 2022. Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=R8sQPpGCv0"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i23.34693"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1020"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000019"},{"key":"e_1_3_2_1_40_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.412"},{"key":"e_1_3_2_1_42_1","volume-title":"Evaluating Open-QA Evaluation. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=UErNpveP6R","author":"Wang Cunxiang","year":"2023","unstructured":"Cunxiang Wang, Sirui Cheng, Qipeng Guo, Yuanhao Yue, Bowen Ding, Zhikun Xu, Yidong Wang, Xiangkun Hu, Zheng Zhang, and Yue Zhang. 2023. Evaluating Open-QA Evaluation. In Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track. https:\/\/openreview.net\/forum?id=UErNpveP6R"},{"key":"e_1_3_2_1_43_1","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=4M9f8VMt2C","author":"Wei Jerry","year":"2024","unstructured":"Jerry Wei, Chengrun Yang, Xinying Song, Yifeng Lu, Nathan Zixia Hu, Jie Huang, Dustin Tran, Daiyi Peng, Ruibo Liu, Da Huang, Cosmo Du, and Quoc V Le. 2024. Long-form factuality in large language models. In The Thirty-eighth Annual Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=4M9f8VMt2C"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.181"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3661370"},{"key":"e_1_3_2_1_46_1","volume-title":"Llm lies: Hallucinations are not bugs, but features as adversarial examples. arXiv preprint arXiv:2310.01469","author":"Yao Jia-Yu","year":"2023","unstructured":"Jia-Yu Yao, Kun-Peng Ning, Zhen-Hui Liu, Mu-Nan Ning, and Li Yuan. 2023. Llm lies: Hallucinations are not bugs, but features as adversarial examples. arXiv preprint arXiv:2310.01469 (2023). https:\/\/arxiv.org\/abs\/2310.01469"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.151"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.365"},{"key":"e_1_3_2_1_49_1","volume-title":"BERTScore: Evaluating Text Generation with BERT. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=SkeHuCVFDr","author":"Zhang Tianyi","year":"2020","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, and Yoav Artzi. 2020. BERTScore: Evaluating Text Generation with BERT. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=SkeHuCVFDr"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Padua Italy","acronym":"SIGIR '25"},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730047","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T10:03:21Z","timestamp":1755857001000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730047"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":49,"alternative-id":["10.1145\/3726302.3730047","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730047","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}