{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T21:05:49Z","timestamp":1760043949660,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,14]],"date-time":"2025-05-14T00:00:00Z","timestamp":1747180800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2238665, 2402696"],"award-info":[{"award-number":["2238665, 2402696"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,14]]},"DOI":"10.1145\/3713082.3730382","type":"proceedings-article","created":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T09:53:51Z","timestamp":1749203631000},"page":"34-41","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Can Large Language Models Verify System Software? A Case Study Using FSCQ as a Benchmark"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-1581-9577","authenticated-orcid":false,"given":"Jianxing","family":"Qin","sequence":"first","affiliation":[{"name":"Duke University"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5268-4934","authenticated-orcid":false,"given":"Alexander","family":"Du","sequence":"additional","affiliation":[{"name":"Duke University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1942-6872","authenticated-orcid":false,"given":"Danfeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Duke University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1034-2736","authenticated-orcid":false,"given":"Matthew","family":"Lentz","sequence":"additional","affiliation":[{"name":"Duke University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0611-3941","authenticated-orcid":false,"given":"Danyang","family":"Zhuo","sequence":"additional","affiliation":[{"name":"Duke University"}]}],"member":"320","published-online":{"date-parts":[[2025,6,6]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2025. Chatbot Arena (formerly LMSYS) Free AI Chat to Compare & Test Best AI Chatbots. https:\/\/lmarena.ai\/"},{"key":"e_1_3_2_1_2_1","unstructured":"2025. Cursor - The AI code editor. https:\/\/www.cursor.com\/"},{"key":"e_1_3_2_1_3_1","volume-title":"ProofNet: Autoformalizing and Formally Proving Undergraduate-Level Mathematics. In Conference on Neural Information Processing Systems (NeurIPS).","author":"Azerbayev Zhangir","year":"2025","unstructured":"Zhangir Azerbayev, Bartosz Piotrowski, Hailey Schoelkopf, Edward W Ayers, Dragomir Radev, and Jeremy Avigad. 2025. ProofNet: Autoformalizing and Formally Proving Undergraduate-Level Mathematics. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_4_1","volume-title":"Llemma: An Open Language Model for Mathematics. In International Conference on Learning Representations (ICLR).","author":"Azerbayev Zhangir","year":"2023","unstructured":"Zhangir Azerbayev, Hailey Schoelkopf, Keiran Paster, Marco Dos Santos, Stephen McAleer, Albert Q Jiang, Jia Deng, Stella Biderman, and Sean Welleck. 2023. Llemma: An Open Language Model for Mathematics. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-22102-1_4"},{"key":"e_1_3_2_1_6_1","volume-title":"Practical Verification of System-Software Components Written in Standard C. In ACM Symposium on Operating Systems Principles (SOSP).","author":"Cebeci Can","year":"2024","unstructured":"Can Cebeci, Yonghao Zou, Diyu Zhou, George Candea, and Cl\u00e9ment Pit-Claudel. 2024. Practical Verification of System-Software Components Written in Standard C. In ACM Symposium on Operating Systems Principles (SOSP)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/AAI30330907"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Saikat Chakraborty Shuvendu K Lahiri Sarah Fakhoury Madanlal Musuvathi Akash Lal Aseem Rastogi Aditya Senthilnathan Rahul Sharma and Nikhil Swamy. 2023. Ranking LLM-Generated Loop Invariants for Program Verification. In Empirical Methods in Natural Language Processing (EMNLP).","DOI":"10.18653\/v1\/2023.findings-emnlp.614"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132747.3132776"},{"key":"e_1_3_2_1_10_1","volume-title":"Using Crash Hoare Logic for Certifying the FSCQ File System. In ACM Symposium on Operating Systems Principles (SOSP).","author":"Chen Haogang","year":"2015","unstructured":"Haogang Chen, Daniel Ziegler, Tej Chajed, Adam Chlipala, M. Frans Kaashoek, and Nickolai Zeldovich. 2015. Using Crash Hoare Logic for Certifying the FSCQ File System. In ACM Symposium on Operating Systems Principles (SOSP)."},{"key":"e_1_3_2_1_11_1","unstructured":"Papers With Code. 2025. spider Benchmark (Text-To-SQL). https:\/\/paperswithcode.com\/sota\/text-to-sql-on-spider"},{"key":"e_1_3_2_1_12_1","volume-title":"Baldur: Whole-Proof Generation and Repair with Large Language Models. In ACM International Conference on the Foundations of Software Engineering (FSE).","author":"First Emily","year":"2023","unstructured":"Emily First, Markus N. Rabe, Talia Ringer, and Yuriy Brun. 2023. Baldur: Whole-Proof Generation and Repair with Large Language Models. In ACM International Conference on the Foundations of Software Engineering (FSE)."},{"key":"e_1_3_2_1_13_1","volume-title":"Data-Centric Serialization for Coq. HAL preprint HAL:01384408","author":"Gallego Arias Emilio Jes\u00fas","year":"2016","unstructured":"Emilio Jes\u00fas Gallego Arias. 2016. SerAPI: Machine-Friendly, Data-Centric Serialization for Coq. HAL preprint HAL:01384408 (2016)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10817-020-09580-x"},{"key":"e_1_3_2_1_15_1","unstructured":"GitHub. 2025. GitHub Copilot \u00b7 Your AI pair programmer. https:\/\/github.com\/features\/copilot"},{"key":"e_1_3_2_1_16_1","volume-title":"CertiKOS: An Extensible Architecture for Building Certified Concurrent OS Kernels. In Symposium on Operating Systems Design and Implementation (OSDI).","author":"Gu Ronghui","year":"2016","unstructured":"Ronghui Gu, Zhong Shao, Hao Chen, Xiongnan Wu, Jieung Kim, Vilhelm Sj\u00f6berg, and David Costanzo. 2016. CertiKOS: An Extensible Architecture for Building Certified Concurrent OS Kernels. In Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2815400.2815428"},{"key":"e_1_3_2_1_18_1","unstructured":"Ziwei Ji Tiezheng Yu Yan Xu Nayeon Lee Etsuko Ishii and Pascale Fung. 2023. Towards Mitigating LLM Hallucination Via Self Reflection. In Empirical Methods in Natural Language Processing (EMNLP)."},{"volume-title":"Automated Reasoning with Otter","author":"Kalman John Arnold","key":"e_1_3_2_1_19_1","unstructured":"John Arnold Kalman. 2001. Automated Reasoning with Otter. Rinton Press Princeton NJ."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/1629575.1629596"},{"key":"e_1_3_2_1_21_1","volume-title":"First-Order Theorem Proving and Vampire. In International Conference on Computer Aided Verification (CAV).","author":"Kov\u00e1cs Laura","year":"2013","unstructured":"Laura Kov\u00e1cs and Andrei Voronkov. 2013. First-Order Theorem Proving and Vampire. In International Conference on Computer Aided Verification (CAV)."},{"key":"e_1_3_2_1_22_1","volume-title":"Verus: A Practical Foundation for Systems Verification. In ACM Symposium on Operating Systems Principles (SOSP).","author":"Lattuada Andrea","year":"2024","unstructured":"Andrea Lattuada, Travis Hance, Jay Bosamiya, Matthias Brun, Chanhee Cho, Hayley LeBlanc, Pranav Srinivasan, Reto Achermann, Tej Chajed, Chris Hawblitzel, Jon Howell, Jacob R. Lorch, Oded Padon, and Bryan Parno. 2024. Verus: A Practical Foundation for Systems Verification. In ACM Symposium on Operating Systems Principles (SOSP)."},{"key":"e_1_3_2_1_23_1","volume-title":"Dafny: An Automatic Program Verifier for Functional Correctness. In Conference on Logic for Programming, Artificial Intelligence, and Reasoning (LPAR).","author":"Leino K. Rustan M.","year":"2010","unstructured":"K. Rustan M. Leino. 2010. Dafny: An Automatic Program Verifier for Functional Correctness. In Conference on Logic for Programming, Artificial Intelligence, and Reasoning (LPAR)."},{"key":"e_1_3_2_1_24_1","volume-title":"Formal Verification of a Realistic Compiler. Commun. ACM 52, 7","author":"Leroy Xavier","year":"2009","unstructured":"Xavier Leroy. 2009. Formal Verification of a Realistic Compiler. Commun. ACM 52, 7 (2009)."},{"key":"e_1_3_2_1_25_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS).","author":"Lin Xiaohan","year":"2024","unstructured":"Xiaohan Lin, Qingxing Cao, Yinya Huang, Haiming Wang, Jianqiao Lu, Zhengying Liu, Linqi Song, and Xiaodan Liang. 2024. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_26_1","volume-title":"Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12","author":"Liu Nelson F","year":"2024","unstructured":"Nelson F Liu, Kevin Lin, John Hewitt, Ashwin Paranjape, Michele Bevilacqua, Fabio Petroni, and Percy Liang. 2024. Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Proof Automation with Large Language Models. In IEEE\/ACM International Conference on Automated Software Engineering (ASE).","author":"Lu Minghai","year":"2024","unstructured":"Minghai Lu, Benjamin Delaware, and Tianyi Zhang. 2024. Proof Automation with Large Language Models. In IEEE\/ACM International Conference on Automated Software Engineering (ASE)."},{"key":"e_1_3_2_1_28_1","volume-title":"Towards AI-assisted Synthesis of Verified Dafny Methods. In ACM International Conference on the Foundations of Software Engineering (FSE).","author":"Hossain Misu Md Rakib","year":"2024","unstructured":"Md Rakib Hossain Misu, Cristina V. Lopes, Iris Ma, and James Noble. 2024. Towards AI-assisted Synthesis of Verified Dafny Methods. In ACM International Conference on the Foundations of Software Engineering (FSE)."},{"key":"e_1_3_2_1_29_1","volume-title":"Scaling Symbolic Evaluation for Automated Verification of Systems Code with Serval. In ACM Symposium on Operating Systems Principles (SOSP).","author":"Nelson Luke","year":"2019","unstructured":"Luke Nelson, James Bornholt, Ronghui Gu, Andrew Baumann, Emina Torlak, and Xi Wang. 2019. Scaling Symbolic Evaluation for Automated Verification of Systems Code with Serval. In ACM Symposium on Operating Systems Principles (SOSP)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132747.3132748"},{"key":"e_1_3_2_1_31_1","unstructured":"OpenAI. 2025. Learning to reason with LLMs. https:\/\/openai.com\/index\/learning-to-reason-with-llms\/"},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Machine Learning (ICML).","author":"Pei Kexin","year":"2023","unstructured":"Kexin Pei, David Bieber, Kensen Shi, Charles Sutton, and Pengcheng Yin. 2023. Can Large Language Models Reason about Program Invariants?. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3691620.3695512"},{"key":"e_1_3_2_1_34_1","volume-title":"Formal Mathematics Statement Curriculum Learning. In Conference on Neural Information Processing Systems (NeurIPS).","author":"Polu Stanislas","year":"2023","unstructured":"Stanislas Polu, Jesse Michael Han, Kunhao Zheng, Mantas Baksys, Igor Babuschkin, and Ilya Sutskever. 2023. Formal Mathematics Statement Curriculum Learning. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_35_1","volume-title":"Generative Language Modeling for Automated Theorem Proving. arXiv preprint arXiv:2009.03393","author":"Polu Stanislas","year":"2020","unstructured":"Stanislas Polu and Ilya Sutskever. 2020. Generative Language Modeling for Automated Theorem Proving. arXiv preprint arXiv:2009.03393 (2020)."},{"key":"e_1_3_2_1_36_1","volume-title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. arXiv preprint arXiv:2402.03300","author":"Shao Zhihong","year":"2024","unstructured":"Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Y. Wu, and Guo Daya. 2024. DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. arXiv preprint arXiv:2402.03300 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Symposium on Operating Systems Design and Implementation (OSDI).","author":"Sigurbjarnarson Helgi","year":"2016","unstructured":"Helgi Sigurbjarnarson, James Bornholt, Emina Torlak, and Xi Wang. 2016. Push-Button Verification of File Systems via Crash Refinement. In Symposium on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_38_1","volume-title":"Dependent Types and Multi-Monadic Effects in F. In ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages (POPL).","author":"Swamy Nikhil","year":"2016","unstructured":"Nikhil Swamy, C\u0103t\u0103lin Hri\u0163cu, Chantal Keller, Aseem Rastogi, Antoine Delignat-Lavaud, Simon Forest, Karthikeyan Bhargavan, C\u00e9dric Fournet, Pierre-Yves Strub, Markulf Kohlweiss, Jean-Karim Zinzindohoue, and Santiago Zanella-B\u00e9guelin. 2016. Dependent Types and Multi-Monadic Effects in F. In ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages (POPL)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483560"},{"key":"e_1_3_2_1_40_1","volume-title":"Rango: Adaptive Retrieval-Augmented Proving for Automated Software Verification. In International Conference on Software Engineering (ICSE).","author":"Thompson Kyle","year":"2025","unstructured":"Kyle Thompson, Nuno Saavedra, Pedro Carrott, Kevin Fisher, Alex Sanchez-Stern, Yuriy Brun, Jo\u00e3o F. Ferreira, Sorin Lerner, and Emily First. 2025. Rango: Adaptive Retrieval-Augmented Proving for Automated Software Verification. In International Conference on Software Engineering (ICSE)."},{"key":"e_1_3_2_1_41_1","volume-title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In Conference on Neural Information Processing Systems (NeurIPS).","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, and Denny Zhou. 2022. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_42_1","volume-title":"Autoformalization with Large Language Models. In Conference on Neural Information Processing Systems (NeurIPS).","author":"Wu Yuhuai","year":"2022","unstructured":"Yuhuai Wu, Albert Qiaochu Jiang, Wenda Li, Markus Rabe, Charles Staats, Mateja Jamnik, and Christian Szegedy. 2022. Autoformalization with Large Language Models. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_43_1","volume-title":"Toward Mathematical Expert Model via Self-Improvement. arXiv preprint arXiv:2409.12122","author":"Yang An","year":"2024","unstructured":"An Yang, Beichen Zhang, Binyuan Hui, Bofei Gao, Bowen Yu, Chengpeng Li, Dayiheng Liu, Jianhong Tu, Jingren Zhou, Junyang Lin, Keming Lu, Mingfeng Xue, Runji Lin, Tianyu Liu, Xingzhang Ren, and Zhenru Zhang. 2024. Qwen2.5-Math Technical Report: Toward Mathematical Expert Model via Self-Improvement. arXiv preprint arXiv:2409.12122 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"AutoVerus: Automated Proof Generation for Rust Code. In International Conference on Learning Representations (ICLR).","author":"Yang Chenyuan","year":"2025","unstructured":"Chenyuan Yang, Xuheng Li, Md Rakib Hossain Misu, Jianan Yao, Weidong Cui, Yeyun Gong, Chris Hawblitzel, Shuvendu Lahiri, Jacob R. Lorch, Shuai Lu, Fan Yang, Ziqiao Zhou, and Shan Lu. 2025. AutoVerus: Automated Proof Generation for Rust Code. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_45_1","volume-title":"LeanDojo: Theorem Proving with Retrieval-Augmented Language Models. In Conference on Neural Information Processing Systems (NeurIPS).","author":"Yang Kaiyu","year":"2024","unstructured":"Kaiyu Yang, Aidan Swope, Alex Gu, Rahul Chalamala, Peiyang Song, Shixing Yu, Saad Godil, Ryan J Prenger, and Animashree Anandkumar. 2024. LeanDojo: Theorem Proving with Retrieval-Augmented Language Models. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_46_1","volume-title":"MetaMath: Bootstrap Your Own Mathematical Questions for Large Language Models. In International Conference on Learning Representations (ICLR).","author":"Yu Longhui","year":"2024","unstructured":"Longhui Yu, Weisen Jiang, Han Shi, Jincheng Yu, Zhengying Liu, Yu Zhang, James T Kwok, Zhenguo Li, Adrian Weller, and Weiyang Liu. 2024. MetaMath: Bootstrap Your Own Mathematical Questions for Large Language Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_47_1","volume-title":"Selene: Pioneering Automated Proof in Software Verification. In Annual Meeting of the Association for Computational Linguistics (ACL).","author":"Zhang Lichen","year":"2024","unstructured":"Lichen Zhang, Shuai Lu, and Nan Duan. 2024. Selene: Pioneering Automated Proof in Software Verification. In Annual Meeting of the Association for Computational Linguistics (ACL)."},{"key":"e_1_3_2_1_48_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Zheng Kunhao","year":"2022","unstructured":"Kunhao Zheng, Jesse Michael Han, and Stanislas Polu. 2022. miniF2F: A Cross-System Benchmark for Formal Olympiad-Level Mathematics. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_49_1","volume-title":"Using Concurrent Relational Logic with Helpers for Verifying the AtomFS File System. In ACM Symposium on Operating Systems Principles (SOSP).","author":"Zou Mo","year":"2019","unstructured":"Mo Zou, Haoran Ding, Dong Du, Ming Fu, Ronghui Gu, and Haibo Chen. 2019. Using Concurrent Relational Logic with Helpers for Verifying the AtomFS File System. In ACM Symposium on Operating Systems Principles (SOSP)."}],"event":{"name":"HOTOS '25: Workshop on Hot Topics in Operating Systems","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"],"location":"Banff AB Canada","acronym":"HOTOS '25"},"container-title":["Proceedings of the Workshop on Hot Topics in Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3713082.3730382","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3713082.3730382","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T16:49:10Z","timestamp":1756486150000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3713082.3730382"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,14]]},"references-count":49,"alternative-id":["10.1145\/3713082.3730382","10.1145\/3713082"],"URL":"https:\/\/doi.org\/10.1145\/3713082.3730382","relation":{},"subject":[],"published":{"date-parts":[[2025,5,14]]},"assertion":[{"value":"2025-06-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}