{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T06:22:21Z","timestamp":1782282141258,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":127,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1925043"],"award-info":[{"award-number":["1925043"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706598.3714113","type":"proceedings-article","created":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T05:30:26Z","timestamp":1745472626000},"page":"1-19","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["VeriPlan: Integrating Formal Verification and LLMs into End-User Planning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0991-8072","authenticated-orcid":false,"given":"Christine P.","family":"Lee","sequence":"first","affiliation":[{"name":"Department of Computer Sciences, University of Wisconsin-Madison, Madison, Wisconsin, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5383-3266","authenticated-orcid":false,"given":"David","family":"Porfirio","sequence":"additional","affiliation":[{"name":"Navy Center for Applied Research in AI, U.S. Naval Research Laboratory, Washington, District of Columbia, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5519-8432","authenticated-orcid":false,"given":"Xinyu Jessica","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Wisconsin - Madison, Madison, Wisconsin, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6349-2862","authenticated-orcid":false,"given":"Kevin Chenkai","family":"Zhao","sequence":"additional","affiliation":[{"name":"People and Robots Lab, University of Wisconsin-Madison, Madison, Wisconsin, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9456-1495","authenticated-orcid":false,"given":"Bilge","family":"Mutlu","sequence":"additional","affiliation":[{"name":"Department of Computer Sciences, University of Wisconsin-Madison, Madison, Wisconsin, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_3_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300233"},{"key":"e_1_3_3_3_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/2930238.2930280"},{"key":"e_1_3_3_3_5_2","volume-title":"Principles of model checking","author":"Baier Christel","year":"2008","unstructured":"Christel Baier and Joost-Pieter Katoen. 2008. Principles of model checking. MIT press."},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"crossref","unstructured":"Charles Bellemare Luc Bissonnette and Sabine Kr\u00f6ger. 2014. Statistical power of within and between-subjects designs in economic experiments. (2014).","DOI":"10.2139\/ssrn.2529895"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445922"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3461702.3462571"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"crossref","unstructured":"David Bourne Jonathan Corney and Satyandra\u00a0K Gupta. 2011. Recent advances and future challenges in automated manufacturing planning. (2011).","DOI":"10.1115\/1.3593411"},{"key":"e_1_3_3_3_10_2","unstructured":"Chi-Min Chan Weize Chen Yusheng Su Jianxuan Yu Wei Xue Shanghang Zhang Jie Fu and Zhiyuan Liu. 2023. Chateval: Towards better llm-based evaluators through multi-agent debate. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.07201 (2023)."},{"key":"e_1_3_3_3_11_2","unstructured":"Xinyun Chen Maxwell Lin Nathanael Sch\u00e4rli and Denny Zhou. 2023. Teaching large language models to self-debug. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.05128 (2023)."},{"key":"e_1_3_3_3_12_2","unstructured":"Yanan Chen Ali Pesaranghader Tanmana Sadhu and Dong\u00a0Hoon Yi. 2024. Can We Rely on LLM Agents to Draft Long-Horizon Plans? Let\u2019s Take TravelPlanner as an Example. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06318 (2024)."},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4614-5583-7_311"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","unstructured":"Emanuele De\u00a0Pellegrin and Ronald P.\u00a0A. Petrick. 2024. Planning Domain Simulation: An Interactive System for Plan Visualisation. Proceedings of the International Conference on Automated Planning and Scheduling 34 1 (May 2024) 133\u2013141. 10.1609\/icaps.v34i1.31469","DOI":"10.1609\/icaps.v34i1.31469"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640544.3645216"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10401-0_9"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4757-2355-7"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"crossref","unstructured":"Zhou Fang Jiaxin Qi Lubin Fan Jianqiang Huang Ying Jin and Tianren Yang. 2022. A framework for human-computer interactive street network design based on a multi-stage deep learning approach. Computers Environment and Urban Systems 96 (2022) 101853.","DOI":"10.1016\/j.compenvurbsys.2022.101853"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","unstructured":"Maria Fox and Derek Long. 2003. PDDL2. 1: An Extension to PDDL for Expressing Temporal Planning Domains. Journal of Artificial Intelligence Research 20 (2003) 61\u2013124. 10.1613\/jair.1129","DOI":"10.1613\/jair.1129"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781139583923"},{"key":"e_1_3_3_3_21_2","unstructured":"Zhibin Gou Zhihong Shao Yeyun Gong Yelong Shen Yujiu Yang Nan Duan and Weizhu Chen. 2023. Critic: Large language models can self-correct with tool-interactive critiquing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.11738 (2023)."},{"key":"e_1_3_3_3_22_2","unstructured":"Atharva Gundawar Mudit Verma Lin Guan Karthik Valmeekam Siddhant Bhambri and Subbarao Kambhampati. 2024. Robust Planning with LLM-Modulo Framework: Case Study in Travel Planning. arxiv:https:\/\/arXiv.org\/abs\/2405.20625\u00a0[cs.AI] https:\/\/arxiv.org\/abs\/2405.20625"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3298689.3347014"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/2792838.2800179"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","DOI":"10.23919\/DATE58400.2024.10546729"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"crossref","unstructured":"Malte Helmert. 2009. Concise finite-domain representations for PDDL planning tasks. Artificial Intelligence 173 5-6 (2009) 503\u2013535.","DOI":"10.1016\/j.artint.2008.10.013"},{"key":"e_1_3_3_3_27_2","unstructured":"Christian Hensel Sebastian Junges Joost-Pieter Katoen Tim Quatmann and Matthias Volk. 2022. The probabilistic model checker Storm. International Journal on Software Tools for Technology Transfer (2022) 1\u201322."},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"crossref","unstructured":"Ahmed Hosny Chintan Parmar John Quackenbush Lawrence\u00a0H Schwartz and Hugo\u00a0JWL Aerts. 2018. Artificial intelligence in radiology. Nature Reviews Cancer 18 8 (2018) 500\u2013510.","DOI":"10.1038\/s41568-018-0016-5"},{"key":"e_1_3_3_3_29_2","unstructured":"Hui Huang Yingqi Qu Jing Liu Muyun Yang and Tiejun Zhao. 2024. An empirical study of llm-as-a-judge for llm evaluation: Fine-tuned judge models are task-specific classifiers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.02839 (2024)."},{"key":"e_1_3_3_3_30_2","unstructured":"Shiyuan Huang Siddarth Mamidanna Shreedhar Jangam Yilun Zhou and Leilani\u00a0H Gilpin. 2023. Can large language models explain themselves? a study of llm-generated self-explanations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.11207 (2023)."},{"key":"e_1_3_3_3_31_2","unstructured":"Xu Huang Weiwen Liu Xiaolong Chen Xingmei Wang Hao Wang Defu Lian Yasheng Wang Ruiming Tang and Enhong Chen. 2024. Understanding the planning of LLM agents: A survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.02716 (2024)."},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/1774088.1774614"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"crossref","unstructured":"Ziwei Ji Nayeon Lee Rita Frieske Tiezheng Yu Dan Su Yan Xu Etsuko Ishii Ye\u00a0Jin Bang Andrea Madotto and Pascale Fung. 2023. Survey of hallucination in natural language generation. Comput. Surveys 55 12 (2023) 1\u201338.","DOI":"10.1145\/3571730"},{"key":"e_1_3_3_3_34_2","unstructured":"Zhenlan Ji Daoyuan Wu Pingchuan Ma Zongjie Li and Shuai Wang. 2024. Testing and Understanding Erroneous Planning in LLM Agents through Synthesized User Inputs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.17833 (2024)."},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606737"},{"key":"e_1_3_3_3_36_2","first-page":"35","volume-title":"IntRS@ RecSys","author":"Jin Yucheng","year":"2017","unstructured":"Yucheng Jin, Bruno De Lemos Ribeiro\u00a0Pinto Cardoso, and Katrien Verbert. 2017. How do different levels of user control affect cognitive load and acceptance of recommendations?. In IntRS@ RecSys. 35\u201342."},{"key":"e_1_3_3_3_37_2","unstructured":"Subbarao Kambhampati Karthik Valmeekam Lin Guan Kaya Stechly Mudit Verma Siddhant Bhambri Lucas Saldyt and Anil Murthy. 2024. LLMs Can\u2019t Plan But Can Help Planning in LLM-Modulo Frameworks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.01817 (2024)."},{"key":"e_1_3_3_3_38_2","series-title":"Proceedings of Machine Learning Research","first-page":"22895","volume-title":"Proceedings of the 41st International Conference on Machine Learning","volume":"235","author":"Kambhampati Subbarao","year":"2024","unstructured":"Subbarao Kambhampati, Karthik Valmeekam, Lin Guan, Mudit Verma, Kaya Stechly, Siddhant Bhambri, Lucas\u00a0Paul Saldyt, and Anil B\u00a0Murthy. 2024. Position: LLMs Can\u2019t Plan, But Can Help Planning in LLM-Modulo Frameworks. In Proceedings of the 41st International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0235), Ruslan Salakhutdinov, Zico Kolter, Katherine Heller, Adrian Weller, Nuria Oliver, Jonathan Scarlett, and Felix Berkenkamp (Eds.). PMLR, 22895\u201322907. https:\/\/proceedings.mlr.press\/v235\/kambhampati24a.html"},{"key":"e_1_3_3_3_39_2","unstructured":"K Kapellos A Micheli and A Valentini. [n. d.]. AIPlan4EU: Planning and Scheduling for Space Applications. ([n. d.])."},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645200"},{"key":"e_1_3_3_3_41_2","unstructured":"Callie\u00a0Y Kim Christine\u00a0P Lee and Bilge Mutlu. 2024. Understanding Large-Language Model (LLM)-powered Human-Robot Interaction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.03217 (2024)."},{"key":"e_1_3_3_3_42_2","unstructured":"Hannah\u00a0Rose Kirk Andrew\u00a0M Bean Bertie Vidgen Paul R\u00f6ttger and Scott\u00a0A Hale. 2023. The past present and better future of feedback learning in large language models for subjective human preferences and values. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.07629 (2023)."},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"crossref","unstructured":"Shunsuke Koga. 2023. Exploring the pitfalls of large language models: Inconsistency and inaccuracy in answering pathology board examination-style questions. medRxiv (2023) 2023\u201308.","DOI":"10.1101\/2023.08.03.23293401"},{"key":"e_1_3_3_3_44_2","unstructured":"Ryan Koo Minhwa Lee Vipul Raheja Jong\u00a0Inn Park Zae\u00a0Myung Kim and Dongyeop Kang. 2023. Benchmarking cognitive biases in large language models as evaluators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.17012 (2023)."},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"crossref","unstructured":"Hadas Kress-Gazit Morteza Lahijanian and Vasumathi Raman. 2018. Synthesis for robots: Guarantees and feedback for robot behavior. Annual Review of Control Robotics and Autonomous Systems 1 (2018) 211\u2013236.","DOI":"10.1146\/annurev-control-060117-104838"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"crossref","unstructured":"Satyapriya Krishna Jiaqi Ma Dylan Slack Asma Ghandeharioun Sameer Singh and Himabindu Lakkaraju. 2024. Post hoc explanations of language models can improve language models. Advances in Neural Information Processing Systems 36 (2024).","DOI":"10.21203\/rs.3.rs-3006112\/v1"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"crossref","unstructured":"Sushant Kumar Sumit Datta Vishakha Singh Deepanwita Datta Sanjay\u00a0Kumar Singh and Ritesh Sharma. 2024. Applications Challenges and Future Directions of Human-in-the-Loop Learning. IEEE Access (2024).","DOI":"10.1109\/ACCESS.2024.3401547"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-22110-1_47"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3643834.3661576"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3643834.3661559"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"crossref","unstructured":"Peter Lee Sebastien Bubeck and Joseph Petro. 2023. Benefits limits and risks of GPT-4 as an AI chatbot for medicine. New England Journal of Medicine 388 13 (2023) 1233\u20131239.","DOI":"10.1056\/NEJMsr2214184"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3603555.3603565"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"crossref","unstructured":"Matteo Leonetti Luca Iocchi and Peter Stone. 2016. A synthesis of automated planning and reinforcement learning for efficient robust decision-making. Artificial Intelligence 241 (2016) 103\u2013130.","DOI":"10.1016\/j.artint.2016.07.004"},{"key":"e_1_3_3_3_54_2","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et\u00a0al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020) 9459\u20139474."},{"key":"e_1_3_3_3_55_2","unstructured":"Shiyang Li Jianshu Chen Yelong Shen Zhiyu Chen Xinlu Zhang Zekun Li Hong Wang Jing Qian Baolin Peng Yi Mao et\u00a0al. 2022. Explanations from large language models make small reasoners better. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.06726 (2022)."},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.291"},{"key":"e_1_3_3_3_57_2","unstructured":"Zhenwen Liang Ye Liu Tong Niu Xiangliang Zhang Yingbo Zhou and Semih Yavuz. 2024. Improving llm reasoning through scaling inference computation with collaborative verification. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.05318 (2024)."},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658104"},{"key":"e_1_3_3_3_59_2","unstructured":"Fang Liu Yang Liu Lin Shi Houkun Huang Ruifeng Wang Zhen Yang Li Zhang Zhongqi Li and Yuchi Ma. 2024. Exploring and evaluating hallucinations in llm-powered code generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.00971 (2024)."},{"key":"e_1_3_3_3_60_2","first-page":"1084","volume-title":"Conference on Robot Learning","author":"Liu Jason\u00a0Xinyu","year":"2023","unstructured":"Jason\u00a0Xinyu Liu, Ziyi Yang, Ifrah Idrees, Sam Liang, Benjamin Schornstein, Stefanie Tellex, and Ankit Shah. 2023. Grounding complex natural language commands for temporal tasks in unseen environments. In Conference on Robot Learning. PMLR, 1084\u20131110."},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3650756"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376739"},{"key":"e_1_3_3_3_63_2","first-page":"43447","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Lu Pan","year":"2023","unstructured":"Pan Lu, Baolin Peng, Hao Cheng, Michel Galley, Kai-Wei Chang, Ying\u00a0Nian Wu, Song-Chun Zhu, and Jianfeng Gao. 2023. Chameleon: Plug-and-Play Compositional Reasoning with Large Language Models. In Advances in Neural Information Processing Systems , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 43447\u201343478. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/871ed095b734818cfba48db6aeb25a62-Paper-Conference.pdf"},{"key":"e_1_3_3_3_64_2","unstructured":"Arnold Lund. 2001. Measuring Usability with the USE Questionnaire. Usability and User Experience Newsletter of the STC Usability SIG 8 (01 2001)."},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613905.3651093"},{"key":"e_1_3_3_3_66_2","unstructured":"Ana Marasovi\u0107 Iz Beltagy Doug Downey and Matthew\u00a0E Peters. 2021. Few-shot self-rationalization with natural language prompts. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.08284 (2021)."},{"key":"e_1_3_3_3_67_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-22868-2_90"},{"key":"e_1_3_3_3_68_2","unstructured":"Anne-Sophie Mayer Franz Strich and Marina Fiedler. 2020. Unintended Consequences of Introducing AI Systems for Decision Making. MIS Quarterly Executive 19 4 (2020)."},{"key":"e_1_3_3_3_69_2","doi-asserted-by":"crossref","unstructured":"Joshua Maynez Shashi Narayan Bernd Bohnet and Ryan McDonald. 2020. On faithfulness and factuality in abstractive summarization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.00661 (2020).","DOI":"10.18653\/v1\/2020.acl-main.173"},{"key":"e_1_3_3_3_70_2","doi-asserted-by":"publisher","unstructured":"Nora McDonald Sarita Schoenebeck and Andrea Forte. 2019. Reliability and Inter-rater Reliability in Qualitative Research: Norms and Guidelines for CSCW and HCI Practice. Proceedings of the ACM on Human-Computer Interaction 3 (11 2019) 1\u201323. 10.1145\/3359174","DOI":"10.1145\/3359174"},{"key":"e_1_3_3_3_71_2","unstructured":"Shervin Minaee Tomas Mikolov Narjes Nikzad Meysam Chenaghlu Richard Socher Xavier Amatriain and Jianfeng Gao. 2024. Large language models: A survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.06196 (2024)."},{"key":"e_1_3_3_3_72_2","unstructured":"Suvir Mirchandani Fei Xia Pete Florence Brian Ichter Danny Driess Montserrat\u00a0Gonzalez Arenas Kanishka Rao Dorsa Sadigh and Andy Zeng. 2023. Large language models as general pattern machines. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.04721 (2023)."},{"key":"e_1_3_3_3_73_2","doi-asserted-by":"crossref","unstructured":"Mahdi Mostajabdaveh Timothy\u00a0T Yu Rindranirina Ramamonjison Giuseppe Carenini Zirui Zhou and Yong Zhang. 2024. Optimization modeling and verification from problem specifications using a multi-agent multi-stage LLM framework. INFOR: Information Systems and Operational Research (2024) 1\u201319.","DOI":"10.1080\/03155986.2024.2381306"},{"key":"e_1_3_3_3_74_2","unstructured":"Dana Nau Yash Bansod Sunandita Patra Mark Roberts and Ruoxi Li. [n. d.]. GTPyhop: A hierarchical goal+ task planner implemented in Python. HPlan 2021 ([n. d.]) 21."},{"key":"e_1_3_3_3_75_2","first-page":"26106","volume-title":"International Conference on Machine Learning","author":"Ni Ansong","year":"2023","unstructured":"Ansong Ni, Srini Iyer, Dragomir Radev, Veselin Stoyanov, Wen-tau Yih, Sida Wang, and Xi\u00a0Victoria Lin. 2023. Lever: Learning to verify language-to-code generation with execution. In International Conference on Machine Learning. PMLR, 26106\u201326128."},{"key":"e_1_3_3_3_76_2","volume-title":"Designing web usability: The practice of simplicity","author":"Nielsen Jakob","year":"1999","unstructured":"Jakob Nielsen. 1999. Designing web usability: The practice of simplicity. New riders publishing."},{"key":"e_1_3_3_3_77_2","unstructured":"Lin Ning Luyang Liu Jiaxing Wu Neo Wu Devora Berlowitz Sushant Prakash Bradley Green Shawn O\u2019Banion and Jun Xie. 2024. User-LLM: Efficient LLM Contextualization with User Embeddings. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.13598 (2024)."},{"key":"e_1_3_3_3_78_2","doi-asserted-by":"publisher","DOI":"10.1609\/icaps.v34i1.31503"},{"key":"e_1_3_3_3_79_2","doi-asserted-by":"crossref","unstructured":"Junsoo Park Seungyeon Jwa Meiying Ren Daeyoung Kim and Sanghyuk Choi. 2024. Offsetbias: Leveraging debiased data for tuning evaluators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.06551 (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.57"},{"key":"e_1_3_3_3_80_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-30122-6_11"},{"key":"e_1_3_3_3_81_2","doi-asserted-by":"publisher","DOI":"10.1145\/3610977.3634974"},{"key":"e_1_3_3_3_82_2","doi-asserted-by":"publisher","DOI":"10.1145\/3242587.3242634"},{"key":"e_1_3_3_3_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376355"},{"key":"e_1_3_3_3_84_2","unstructured":"Vipula Rawte Amit Sheth and Amitava Das. 2023. A survey of hallucination in large foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.05922 (2023)."},{"key":"e_1_3_3_3_85_2","first-page":"887","volume-title":"Healthcare","author":"Sallam Malik","year":"2023","unstructured":"Malik Sallam. 2023. ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns. In Healthcare , Vol.\u00a011. MDPI, 887."},{"key":"e_1_3_3_3_86_2","doi-asserted-by":"publisher","DOI":"10.1145\/3596671.3597650"},{"key":"e_1_3_3_3_87_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-04870-8_9"},{"key":"e_1_3_3_3_88_2","doi-asserted-by":"crossref","unstructured":"Wout Schellaert Fernando Mart\u00ednez-Plumed Karina Vold John Burden Pablo\u00a0AM Casares Bao\u00a0Sheng Loe Roi Reichart Anna Korhonen Jos\u00e9 Hern\u00e1ndez-Orallo et\u00a0al. 2023. Your prompt is my command: on assessing the human-centred generality of multimodal models. Journal of Artificial Intelligence Research 77 (2023) 377\u2013394.","DOI":"10.1613\/jair.1.14157"},{"key":"e_1_3_3_3_89_2","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415872"},{"key":"e_1_3_3_3_90_2","unstructured":"M Shah Luk\u00e1s Chrpa Falilat Jimoh D Kitchin T McCluskey Simon Parkinson and Mauro Vallati. 2013. Knowledge engineering tools in planning: State-of-the-art and future challenges. Knowledge engineering for planning and scheduling 53 (2013) 53."},{"key":"e_1_3_3_3_91_2","doi-asserted-by":"crossref","unstructured":"Fu Shang Fanyi Zhao Mingxuan Zhang Jun Sun and Jiatu Shi. 2024. Personalized recommendation systems powered by large language models: Integrating semantic understanding and user preferences. International Journal of Innovative Research in Engineering and Management 11 4 (2024) 39\u201349.","DOI":"10.55524\/ijirem.2024.11.4.6"},{"key":"e_1_3_3_3_92_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676450"},{"key":"e_1_3_3_3_93_2","doi-asserted-by":"crossref","unstructured":"Donghee Shin. 2021. The effects of explainability and causability on perception trust and acceptance: Implications for explainable AI. International journal of human-computer studies 146 (2021) 102551.","DOI":"10.1016\/j.ijhcs.2020.102551"},{"key":"e_1_3_3_3_94_2","volume-title":"Designing the user interface: strategies for effective human-computer interaction","author":"Shneiderman Ben","year":"2010","unstructured":"Ben Shneiderman and Catherine Plaisant. 2010. Designing the user interface: strategies for effective human-computer interaction. Pearson Education India."},{"key":"e_1_3_3_3_95_2","doi-asserted-by":"publisher","unstructured":"Tom Silver Soham Dan Kavitha Srinivas Joshua\u00a0B. Tenenbaum Leslie Kaelbling and Michael Katz. 2024. Generalized Planning in PDDL Domains with Pretrained Large Language Models. Proceedings of the AAAI Conference on Artificial Intelligence 38 18 (Mar. 2024) 20256\u201320264. 10.1609\/aaai.v38i18.30006","DOI":"10.1609\/aaai.v38i18.30006"},{"key":"e_1_3_3_3_96_2","doi-asserted-by":"crossref","unstructured":"Itamar Simonson. 2005. Determinants of customers\u2019 responses to customized offers: Conceptual framework and research propositions. Journal of marketing 69 1 (2005) 32\u201345.","DOI":"10.1509\/jmkg.69.1.32.55512"},{"key":"e_1_3_3_3_97_2","unstructured":"Guijin Son Hyunwoo Ko Hoyoung Lee Yewon Kim and Seunghyeok Hong. 2024. Llm-as-a-judge & reward model: What they can and cannot do. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.11239 (2024)."},{"key":"e_1_3_3_3_98_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00280"},{"key":"e_1_3_3_3_99_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00280"},{"key":"e_1_3_3_3_100_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642754"},{"key":"e_1_3_3_3_101_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606756"},{"key":"e_1_3_3_3_102_2","doi-asserted-by":"publisher","DOI":"10.1145\/3490099.3511119"},{"key":"e_1_3_3_3_103_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642902"},{"key":"e_1_3_3_3_104_2","unstructured":"Lukas Teufelberger Xintong Liu Zhipeng Li Max Moebus and Christian Holz. 2024. LLM-for-X: Application-agnostic Integration of Large Language Models to Support Personal Writing Workflows. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.21593 (2024)."},{"key":"e_1_3_3_3_105_2","unstructured":"Miles Turpin Julian Michael Ethan Perez and Samuel Bowman. 2024. Language models don\u2019t always say what they think: unfaithful explanations in chain-of-thought prompting. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_3_106_2","unstructured":"Karthik Valmeekam Matthew Marquez Sarath Sreedharan and Subbarao Kambhampati. 2023. On the planning abilities of large language models-a critical investigation. Advances in Neural Information Processing Systems 36 (2023) 75993\u201376005."},{"key":"e_1_3_3_3_107_2","volume-title":"NeurIPS 2022 Foundation Models for Decision Making Workshop","author":"Valmeekam Karthik","year":"2022","unstructured":"Karthik Valmeekam, Alberto Olmo, Sarath Sreedharan, and Subbarao Kambhampati. 2022. Large language models still can\u2019t plan (a benchmark for LLMs on planning and reasoning about change). In NeurIPS 2022 Foundation Models for Decision Making Workshop."},{"key":"e_1_3_3_3_108_2","unstructured":"Helena Vasconcelos Gagan Bansal Adam Fourney Q\u00a0Vera Liao and Jennifer\u00a0Wortman Vaughan. 2023. Generation probabilities are not enough: Exploring the effectiveness of uncertainty highlighting in AI-powered code completions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.07248 (2023)."},{"key":"e_1_3_3_3_109_2","unstructured":"Peiyi Wang Lei Li Liang Chen Zefan Cai Dawei Zhu Binghuai Lin Yunbo Cao Qi Liu Tianyu Liu and Zhifang Sui. 2023. Large language models are not fair evaluators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.17926 (2023)."},{"key":"e_1_3_3_3_110_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3641960"},{"key":"e_1_3_3_3_111_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3641960"},{"key":"e_1_3_3_3_112_2","unstructured":"Zihao Wang Shaofei Cai Guanzhou Chen Anji Liu Xiaojian Ma and Yitao Liang. 2023. Describe explain plan and select: Interactive planning with large language models enables open-world multi-task agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.01560 (2023)."},{"key":"e_1_3_3_3_113_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642335"},{"key":"e_1_3_3_3_114_2","unstructured":"Sarah Wiegreffe Jack Hessel Swabha Swayamdipta Mark Riedl and Yejin Choi. 2021. Reframing human-AI collaboration for generating free-text explanations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2112.08674 (2021)."},{"key":"e_1_3_3_3_115_2","volume-title":"ICLR 2024 Workshop on Large Language Model (LLM) Agents","author":"Wu Yiran","year":"2024","unstructured":"Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin\u00a0Tat Lee, Richard Peng, Qingyun Wu, and Chi Wang. 2024. MathChat: Converse to Tackle Challenging Math Problems with LLM Agents. In ICLR 2024 Workshop on Large Language Model (LLM) Agents."},{"key":"e_1_3_3_3_116_2","unstructured":"Jian Xie Kai Zhang Jiangjie Chen Tinghui Zhu Renze Lou Yuandong Tian Yanghua Xiao and Yu Su. 2024. Travelplanner: A benchmark for real-world planning with language agents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.01622 (2024)."},{"key":"e_1_3_3_3_117_2","doi-asserted-by":"crossref","unstructured":"Jingfeng Yang Hongye Jin Ruixiang Tang Xiaotian Han Qizhang Feng Haoming Jiang Shaochen Zhong Bing Yin and Xia Hu. 2024. Harnessing the power of llms in practice: A survey on chatgpt and beyond. ACM Transactions on Knowledge Discovery from Data 18 6 (2024) 1\u201332.","DOI":"10.1145\/3649506"},{"key":"e_1_3_3_3_118_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611447"},{"key":"e_1_3_3_3_119_2","unstructured":"Jia-Yu Yao Kun-Peng Ning Zhen-Hui Liu Mu-Nan Ning Yu-Yang Liu and Li Yuan. 2023. Llm lies: Hallucinations are not bugs but features as adversarial examples. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.01469 (2023)."},{"key":"e_1_3_3_3_120_2","unstructured":"Shunyu Yao Jeffrey Zhao Dian Yu Nan Du Izhak Shafran Karthik Narasimhan and Yuan Cao. 2022. React: Synergizing reasoning and acting in language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.03629 (2022)."},{"key":"e_1_3_3_3_121_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581388"},{"key":"e_1_3_3_3_122_2","unstructured":"Bin Zhang Hangyu Mao Jingqing Ruan Ying Wen Yang Li Shao Zhang Zhiwei Xu Dapeng Li Ziyue Li Rui Zhao et\u00a0al. 2023. Controlling large language model-based agents for large-scale decision-making: An actor-critic approach. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.13884 (2023)."},{"key":"e_1_3_3_3_123_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_10"},{"key":"e_1_3_3_3_124_2","unstructured":"Tao Zhang Yanjun Shen Wenjing Luo Yan Zhang Hao Liang Fan Yang Mingan Lin Yujing Qiao Weipeng Chen Bin Cui et\u00a0al. 2024. Cfbench: A comprehensive constraints-following benchmark for llms. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.01122 (2024)."},{"key":"e_1_3_3_3_125_2","unstructured":"Xinghua Zhang Bowen Yu Haiyang Yu Yangyu Lv Tingwen Liu Fei Huang Hongbo Xu and Yongbin Li. 2023. Wider and deeper llm networks are fairer llm evaluators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.01862 (2023)."},{"key":"e_1_3_3_3_126_2","doi-asserted-by":"crossref","unstructured":"Haiyan Zhao Hanjie Chen Fan Yang Ninghao Liu Huiqi Deng Hengyi Cai Shuaiqiang Wang Dawei Yin and Mengnan Du. 2024. Explainability for large language models: A survey. ACM Transactions on Intelligent Systems and Technology 15 2 (2024) 1\u201338.","DOI":"10.1145\/3639372"},{"key":"e_1_3_3_3_127_2","unstructured":"Wayne\u00a0Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong et\u00a0al. 2023. A survey of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.18223 (2023)."},{"key":"e_1_3_3_3_128_2","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric Xing et\u00a0al. 2024. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems 36 (2024)."}],"event":{"name":"CHI 2025: CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3714113","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3714113","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3714113","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T05:57:00Z","timestamp":1751608620000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3714113"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":127,"alternative-id":["10.1145\/3706598.3714113","10.1145\/3706598"],"URL":"https:\/\/doi.org\/10.1145\/3706598.3714113","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}