{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T02:07:36Z","timestamp":1781143656135,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":107,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3736572","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:04:26Z","timestamp":1754255066000},"page":"6162-6172","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["The Hitchhikers Guide to Production-ready Trustworthy Foundation Model Powered Software (FMware)"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0292-4970","authenticated-orcid":false,"given":"Kirill","family":"Vasilevski","sequence":"first","affiliation":[{"name":"Huawei Canada, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1812-5365","authenticated-orcid":false,"given":"Gopi Krishnan","family":"Rajbahadur","sequence":"additional","affiliation":[{"name":"Huawei Canada, Kingston, Canada and Queen's University, Kingston, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5419-9284","authenticated-orcid":false,"given":"Gustavo A.","family":"Oliva","sequence":"additional","affiliation":[{"name":"Huawei Canada, Kingston, Canada and Queen's University, Kingston, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5947-2684","authenticated-orcid":false,"given":"Benjamin","family":"Rombaut","sequence":"additional","affiliation":[{"name":"Huawei Canada, Kingston, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5880-5114","authenticated-orcid":false,"given":"Keheliya","family":"Gallaba","sequence":"additional","affiliation":[{"name":"Huawei Canada, Kingston, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5494-685X","authenticated-orcid":false,"given":"Filipe R.","family":"Cogo","sequence":"additional","affiliation":[{"name":"Huawei Canada, Kingston, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7133-2219","authenticated-orcid":false,"given":"Jiahuei (Justina)","family":"Lin","sequence":"additional","affiliation":[{"name":"Huawei Canada, Kingston, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4034-6650","authenticated-orcid":false,"given":"Dayi","family":"Lin","sequence":"additional","affiliation":[{"name":"Huawei Canada, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3921-1724","authenticated-orcid":false,"given":"Haoxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Canada, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9103-5820","authenticated-orcid":false,"given":"Bouyan","family":"Chen","sequence":"additional","affiliation":[{"name":"Huawei Canada, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8649-6163","authenticated-orcid":false,"given":"Kishanthan","family":"Thangarajah","sequence":"additional","affiliation":[{"name":"Huawei Canada, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7749-5513","authenticated-orcid":false,"given":"Ahmed E.","family":"Hassan","sequence":"additional","affiliation":[{"name":"Queen's University, Kingston, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3063-3197","authenticated-orcid":false,"given":"Zhen Ming (Jack)","family":"Jiang","sequence":"additional","affiliation":[{"name":"York University, Toronto, Canada"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2023. Testing LLM-Based Applications: Strategy and Challenges - blog.scottlogic.com. https:\/\/blog.scottlogic.com\/2023\/11\/14\/testing-LLM-based-applications-strategy-and-challenges.html. [Accessed 07-10-2024]."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. Musings on Building a Generative AI Product - linkedin.com. https:\/\/www.linkedin.com\/blog\/engineering\/generative-ai\/musings-on-building-a-generative-ai-product. [Accessed 08-10-2024]."},{"key":"e_1_3_2_1_3_1","unstructured":"2025. Copilot Internals. https:\/\/thakkarparth007.github.io\/copilot-explorer\/posts\/copilot-internals last accessed: 2025-03-11."},{"key":"e_1_3_2_1_4_1","unstructured":"2025. LangSmith by LangChain. https:\/\/www.langchain.com\/langsmith\/ last accessed: 2025-03-11."},{"key":"e_1_3_2_1_5_1","unstructured":"2025. Traceloop. https:\/\/www.traceloop.com\/ last accessed: 2025-03-11."},{"key":"e_1_3_2_1_6_1","unstructured":"2025. Weights and Biases - Weave. https:\/\/wandb.ai\/site\/weave\/ last accessed: 2025-03-11."},{"key":"e_1_3_2_1_7_1","unstructured":"Dr. Assad Abbas. 2024. The Financial Challenges of Leading in AI: A Look at OpenAI's Operating Costs - unite.ai. https:\/\/www.unite.ai\/the-financial-challenges-of-leading-in-ai-a-look-at-openais-operating-costs\/. [Accessed 11-10-2024]."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE56229.2023.00065"},{"key":"e_1_3_2_1_9_1","volume-title":"Shayne Longpre, Nathan Lambert, Xinyi Wang, Niklas Muennighoff, Bairu Hou, Liangming Pan, Haewon Jeong, et al.","author":"Albalak Alon","year":"2024","unstructured":"Alon Albalak, Yanai Elazar, Sang Michael Xie, Shayne Longpre, Nathan Lambert, Xinyi Wang, Niklas Muennighoff, Bairu Hou, Liangming Pan, Haewon Jeong, et al. 2024. A survey on data selection for language models. arXiv preprint arXiv:2402.16827 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"How Good Is It? Evaluating the Efficacy of Common versus Domain-Specific Prompts on Foundational Large Language Models. arXiv preprint arXiv:2407.11006","author":"Amujo Oluyemi Enoch","year":"2024","unstructured":"Oluyemi Enoch Amujo and Shanchieh Jay Yang. 2024. How Good Is It? Evaluating the Efficacy of Common versus Domain-Specific Prompts on Foundational Large Language Models. arXiv preprint arXiv:2407.11006 (2024)."},{"key":"e_1_3_2_1_11_1","unstructured":"Anthropic. n.d.. All models overview - Anthropic. https:\/\/docs.anthropic.com\/en\/docs\/about-claude\/models\/all-models. Accessed: 2025-02-01."},{"key":"e_1_3_2_1_12_1","unstructured":"Anthropic. n.d.. Claude 3.7 Sonnet and Claude Code. https:\/\/www.anthropic.com\/news\/claude-3-7-sonnet. Accessed: 2025-02-01."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3563657.3596046"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CQR.2009.5137352"},{"key":"e_1_3_2_1_15_1","volume-title":"Current state of LLM Risks and AI Guardrails. arXiv preprint arXiv:2406.12934","author":"Ayyamperumal Suriya Ganesh","year":"2024","unstructured":"Suriya Ganesh Ayyamperumal and Limin Ge. 2024. Current state of LLM Risks and AI Guardrails. arXiv preprint arXiv:2406.12934 (2024)."},{"key":"e_1_3_2_1_16_1","unstructured":"R. Bommasani D.A Hudson E. Adeli Altman et al. 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)."},{"key":"e_1_3_2_1_17_1","unstructured":"D. Brajovic N. Renner V.P. Goebels P. Wagner B. Fresz et al. 2023. Model Reporting for Certifiable AI: A Proposal from Merging EU Regulation into AI Development. arXiv preprint arXiv:2307.11525 (2023)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2017.8258038"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"K.K Chang M. Cramer S. Soni and D. Bamman. 2023. Speak memory: An archaeology of books known to chatgpt\/gpt-4. arXiv preprint arXiv:2305.00118 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.453"},{"key":"e_1_3_2_1_20_1","unstructured":"Dong Chen Shaoxin Lin Muhan Zeng Daoguang Zan Jian-Gang Wang Anton Cheshkov Jun Sun Hao Yu Guoliang Dong Artem Aliev et al. 2024. CodeR: Issue Resolving with Multi-Agent and Task Graphs. arXiv preprint arXiv:2406.01304 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Humans or llms as the judge? a study on judgement biases. arXiv preprint arXiv:2402.10669","author":"Chen Guiming Hardy","year":"2024","unstructured":"Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. 2024. Humans or llms as the judge? a study on judgement biases. arXiv preprint arXiv:2402.10669 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Michael JQ Zhang, and Eunsol Choi","author":"Chen Hung-Ting","year":"2022","unstructured":"Hung-Ting Chen, Michael JQ Zhang, and Eunsol Choi. 2022. Rich knowledge sources bring complex knowledge conflicts: Recalibrating models to reflect conflicting evidence. arXiv preprint arXiv:2210.13701 (2022)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"L. Chen M. Zaharia and J. Zou. 2023. How is ChatGPT's behavior changing over time? arXiv:2307.09009 [cs.CL]","DOI":"10.1162\/99608f92.5317da47"},{"key":"e_1_3_2_1_24_1","volume-title":"An Empirical Study on Challenges for LLM Application Developers. ACM Transactions on Software Engineering and Methodology","author":"Chen Xiang","year":"2025","unstructured":"Xiang Chen, Chaoyang Gao, Chunyang Chen, Guangbei Zhang, and Yong Liu. 2025. An Empirical Study on Challenges for LLM Application Developers. ACM Transactions on Software Engineering and Methodology (2025)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3531146.3533143"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657834"},{"key":"e_1_3_2_1_27_1","volume-title":"Architecture and Production Readiness Reviews in Practice. arXiv preprint arXiv:1305.2402","author":"Cusick James","year":"2013","unstructured":"James Cusick. 2013. Architecture and Production Readiness Reviews in Practice. arXiv preprint arXiv:1305.2402 (2013)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.222"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671452"},{"key":"e_1_3_2_1_30_1","volume-title":"A Framework for Real-time Safeguarding the Text Generation of Large Language. arXiv preprint arXiv:2404.19048","author":"Dong Ximing","year":"2024","unstructured":"Ximing Dong, Dayi Lin, Shaowei Wang, and Ahmed E Hassan. 2024. A Framework for Real-time Safeguarding the Text Generation of Large Language. arXiv preprint arXiv:2404.19048 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"The EU AI Act: a summary of its significance and scope. AI (the EU AI Act) 1","author":"Edwards L.","year":"2021","unstructured":"L. Edwards. 2021. The EU AI Act: a summary of its significance and scope. AI (the EU AI Act) 1 (2021)."},{"key":"e_1_3_2_1_32_1","unstructured":"Hugging Face. n.d.. Models - Hugging Face. https:\/\/huggingface.co\/models. Accessed: 2025-02-01."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-FoSE59343.2023.00008"},{"key":"e_1_3_2_1_34_1","volume-title":"Promptbreeder: Self-Referential Self-Improvement Via Prompt Evolution. arXiv:2309.16797 [cs.CL] https:\/\/arxiv.org\/abs\/2309.16797","author":"Fernando Chrisantha","year":"2023","unstructured":"Chrisantha Fernando, Dylan Banarse, Henryk Michalewski, Simon Osindero, and Tim Rockt\u00e4schel. 2023. Promptbreeder: Self-Referential Self-Improvement Via Prompt Evolution. arXiv:2309.16797 [cs.CL] https:\/\/arxiv.org\/abs\/2309.16797"},{"key":"e_1_3_2_1_35_1","volume-title":"Proc. of Annual Meeting of the ACL-IJCNLP. (ACL), 3816-3830","author":"Gao T.","unstructured":"T. Gao, A. Fisch, and D. Chen. 2021. Making Pre-trained Language Models Better Few-shot Learners. In Proc. of Annual Meeting of the ACL-IJCNLP. (ACL), 3816-3830."},{"key":"e_1_3_2_1_36_1","volume-title":"Prompt Cache: Modular Attention Reuse for Low-Latency Inference. arXiv preprint arXiv:2311.04934","author":"Gim I.","year":"2023","unstructured":"I. Gim, G. Chen, S.S. Lee, N. Sarda, A. Khandelwal, et al. 2023. Prompt Cache: Modular Attention Reuse for Low-Latency Inference. arXiv preprint arXiv:2311.04934 (2023)."},{"key":"e_1_3_2_1_37_1","unstructured":"Github. [n. d.]. GitHub Next | Copilot Workspace. https:\/\/githubnext.com\/projects\/copilot-workspace\/. Accessed 02-06-2024."},{"key":"e_1_3_2_1_38_1","volume-title":"And Segment Forecasts, 2024","author":"Research Grand View","year":"2023","unstructured":"Grand View Research. 2023. Large Language Model Market Size, Share & Trends Analysis Report By Application (Customer Service, Content Generation), By Deployment, By Industry Vertical, By Region, And Segment Forecasts, 2024 - 2030. https:\/\/www.grandviewresearch.com\/industry-analysis\/large-language-model-llm-market-report."},{"key":"e_1_3_2_1_39_1","unstructured":"R. Grosse J. Bae C. Anil N. Elhage A. Tamkin et al. 2023. Studying large language model generalization with influence functions. arXiv preprint arXiv:2308.03296 (2023)."},{"key":"e_1_3_2_1_40_1","unstructured":"S. Gunasekar Y. Zhang J. Aneja C.C.T. Mendes A. Del Giorno et al. 2023. Textbooks Are All You Need. arXiv preprint arXiv:2306.11644 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"Large language model based multi-agents: A survey of progress and challenges. arXiv preprint arXiv:2402.01680","author":"Guo Taicheng","year":"2024","unstructured":"Taicheng Guo, Xiuying Chen, Yaqi Wang, Ruidi Chang, Shichao Pei, Nitesh V Chawla, Olaf Wiest, and Xiangliang Zhang. 2024. Large language model based multi-agents: A survey of progress and challenges. arXiv preprint arXiv:2402.01680 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s40747-024-01527-8"},{"key":"e_1_3_2_1_43_1","volume-title":"Keheliya Gallaba, Filipe Roseiro Cogo, Boyuan Chen, Haoxiang Zhang, Kishanthan Thangarajah, Gustavo Oliva, Jiahuei (Justina) Lin, Wali Mohammad Abdullah, and Zhen Ming (Jack) Jiang.","author":"Hassan Ahmed E.","year":"2024","unstructured":"Ahmed E. Hassan, Dayi Lin, Gopi Krishnan Rajbahadur, Keheliya Gallaba, Filipe Roseiro Cogo, Boyuan Chen, Haoxiang Zhang, Kishanthan Thangarajah, Gustavo Oliva, Jiahuei (Justina) Lin, Wali Mohammad Abdullah, and Zhen Ming (Jack) Jiang. 2024. Rethinking Software Engineering in the Era of Foundation Models: A Curated Catalogue of Challenges in the Development of Trustworthy FMware. In Companion Proceedings of the 32nd ACM International Conference on the Foundations of Software Engineering (Porto de Galinhas, Brazil). 294-305."},{"key":"e_1_3_2_1_44_1","unstructured":"Ahmed E. Hassan Gustavo A. Oliva Dayi Lin Boyuan Chen Zhen Ming and Jiang. 2024. Towards AI-Native Software Engineering (SE 3.0): A Vision and a Challenge Roadmap. arXiv:2410.06107"},{"key":"e_1_3_2_1_45_1","unstructured":"S. Hong M. Zhuge J. Chen X. Zheng Y. Cheng et al. 2023. MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework. arXiv:2308.00352 [cs.AI]"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3695988"},{"key":"e_1_3_2_1_47_1","volume-title":"Understanding the planning of LLM agents: A survey. arXiv preprint arXiv:2402.02716","author":"Huang Xu","year":"2024","unstructured":"Xu Huang, Weiwen Liu, Xiaolong Chen, Xingmei Wang, Hao Wang, Defu Lian, Yasheng Wang, Ruiming Tang, and Enhong Chen. 2024. Understanding the planning of LLM agents: A survey. arXiv preprint arXiv:2402.02716 (2024)."},{"key":"e_1_3_2_1_48_1","unstructured":"A.Q Jiang A. Sablayrolles A. Mensch C. Bamford D.S. Chaplot et al. 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_1_49_1","volume-title":"A Survey on Large Language Models for Code Generation. arXiv preprint arXiv:2406.00515","author":"Jiang Juyong","year":"2024","unstructured":"Juyong Jiang, Fan Wang, Jiasi Shen, Sungju Kim, and Sunghun Kim. 2024. A Survey on Large Language Models for Code Generation. arXiv preprint arXiv:2406.00515 (2024)."},{"key":"e_1_3_2_1_50_1","volume-title":"From llms to llm-based agents for software engineering: A survey of current, challenges and future. arXiv preprint arXiv:2408.02479","author":"Jin Haolin","year":"2024","unstructured":"Haolin Jin, Linghan Huang, Haipeng Cai, Jun Yan, Bo Li, and Huaming Chen. 2024. From llms to llm-based agents for software engineering: A survey of current, challenges and future. arXiv preprint arXiv:2408.02479 (2024)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-65647-7_8"},{"key":"e_1_3_2_1_52_1","volume-title":"Renze Lou, Jihyun Janice Ahn, Yilun Zhao, Xiaoxin Lu, Nan Zhang, Yusen Zhang, Ranran Haoran Zhang, Sujeeth Reddy Vummanthala, et al.","author":"Kamoi Ryo","year":"2024","unstructured":"Ryo Kamoi, Sarkar Snigdha Sarathi Das, Renze Lou, Jihyun Janice Ahn, Yilun Zhao, Xiaoxin Lu, Nan Zhang, Yusen Zhang, Ranran Haoran Zhang, Sujeeth Reddy Vummanthala, et al. 2024. Evaluating LLMs at Detecting Errors in LLM Responses. arXiv preprint arXiv:2404.03602 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Khattab Omar","year":"2024","unstructured":"Omar Khattab, Arnav Singhvi, Paridhi Maheshwari, Zhiyuan Zhang, Keshav Santhanam, Sri Vardhamanan, Saiful Haq, Ashutosh Sharma, Thomas T. Joshi, Hanna Moazam, Heather Miller, Matei Zaharia, and Christopher Potts. 2024. DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines. The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_55_1","volume-title":"Summary of a haystack: A challenge to long-context llms and rag systems. arXiv preprint arXiv:2407.01370","author":"Laban Philippe","year":"2024","unstructured":"Philippe Laban, Alexander R Fabbri, Caiming Xiong, and Chien-ShengWu. 2024. Summary of a haystack: A challenge to long-context llms and rag systems. arXiv preprint arXiv:2407.01370 (2024)."},{"key":"e_1_3_2_1_56_1","first-page":"9459","volume-title":"Lin (Eds.)","volume":"33","author":"Lewis P.","year":"2020","unstructured":"P. Lewis, E. Perez, A. Piktus, F. Petroni, V. Karpukhin, et al. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 9459-9474."},{"key":"e_1_3_2_1_57_1","volume-title":"Hassan","author":"Lin Jiahuei","year":"2024","unstructured":"Jiahuei Lin, Dayi Lin, Sky Zhang, and Ahmed E. Hassan. 2024. Engineering AI Judge Systems. arXiv:2411.17793 [cs.SE] https:\/\/arxiv.org\/abs\/2411.17793"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00638"},{"key":"e_1_3_2_1_59_1","unstructured":"Ruibo Liu Jerry Wei Fangyu Liu Chenglei Si Yanzhe Zhang Jinmeng Rao Steven Zheng Daiyi Peng Diyi Yang Denny Zhou et al. 2024. Best practices and lessons learned on synthetic data for language models. arXiv preprint arXiv:2404.07503 (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"Gpteval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634","author":"Liu Y.","year":"2023","unstructured":"Y. Liu, D. Iter, Yi. Xu, S. Wang, R. Xu, and C. Zhu. 2023. Gpteval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634 (2023)."},{"key":"e_1_3_2_1_61_1","unstructured":"S. Longpre R. Mahari A. Chen N. Obeng-Marnu D. Sileo et al. 2023. The Data Provenance Initiative: A Large Scale Audit of Dataset Licensing & Attribution in AI. arXiv preprint arXiv:2310.16787 (2023)."},{"key":"e_1_3_2_1_62_1","volume-title":"Arena Learning: Build Data Flywheel for LLMs Post-training via Simulated Chatbot Arena. arXiv preprint arXiv:2407.10627","author":"Luo Haipeng","year":"2024","unstructured":"Haipeng Luo, Qingfeng Sun, Can Xu, Pu Zhao, Qingwei Lin, Jianguang Lou, Shifeng Chen, Yansong Tang, and Weizhu Chen. 2024. Arena Learning: Build Data Flywheel for LLMs Post-training via Simulated Chatbot Arena. arXiv preprint arXiv:2407.10627 (2024)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3644815.3644950"},{"key":"e_1_3_2_1_64_1","unstructured":"Microsoft. [n. d.]. microsoft\/semantic-kernel: Integrate cutting-edge LLM technology quickly and easily into your apps. https:\/\/github.com\/microsoft\/semantic-kernel. Accessed 01-30-2024."},{"key":"e_1_3_2_1_65_1","volume-title":"Use of llms for illicit purposes: Threats, prevention measures, and vulnerabilities. arXiv preprint arXiv:2308.12833","author":"Mozes M.","year":"2023","unstructured":"M. Mozes, X. He, B. Kleinberg, and L.D Griffin. 2023. Use of llms for illicit purposes: Threats, prevention measures, and vulnerabilities. arXiv preprint arXiv:2308.12833 (2023)."},{"key":"e_1_3_2_1_66_1","unstructured":"V. Murali C. Maddila I. Ahmad M. Bolin D. Cheng et al. 2023. CodeCompose: A Large-Scale Industrial Deployment of AI-assisted Code Authoring. arXiv preprint arXiv:2305.12050 (2023)."},{"key":"e_1_3_2_1_67_1","volume-title":"Beyond the Comfort Zone: Emerging Solutions to Overcome Challenges in Integrating LLMs into Software Products. arXiv preprint arXiv:2410.12071","author":"Nahar Nadia","year":"2024","unstructured":"Nadia Nahar, Christian K\u00e4stner, Jenna Butler, Chris Parnin, Thomas Zimmermann, and Christian Bird. 2024. Beyond the Comfort Zone: Emerging Solutions to Overcome Challenges in Integrating LLMs into Software Products. arXiv preprint arXiv:2410.12071 (2024)."},{"key":"e_1_3_2_1_68_1","unstructured":"OpenAI. n.d.. Hello GPT-4o | OpenAI. https:\/\/openai.com\/index\/hello-gpt-4o\/. Accessed: 2025-02-01."},{"key":"e_1_3_2_1_69_1","unstructured":"OpenAI. n.d.. Models - OpenAI API. https:\/\/platform.openai.com\/docs\/models. Accessed: 2025-02-01."},{"key":"e_1_3_2_1_70_1","unstructured":"OpenAI. n.d.. OpenAI o1 Hub | OpenAI. https:\/\/openai.com\/o1\/. Accessed: 2025-02-01."},{"key":"e_1_3_2_1_71_1","volume-title":"Memgpt: Towards llms as operating systems. arXiv preprint arXiv:2310.08560","author":"Packer Charles","year":"2023","unstructured":"Charles Packer, Vivian Fang, Shishir G Patil, Kevin Lin, Sarah Wooders, and Joseph E Gonzalez. 2023. Memgpt: Towards llms as operating systems. arXiv preprint arXiv:2310.08560 (2023)."},{"key":"e_1_3_2_1_72_1","unstructured":"C. Parnin G. Soares R. Pandita S. Gulwani J. Rich et al. 2023. Building Your Own Product Copilot: Challenges Opportunities and Needs. arXiv:2312.14231 [cs.SE]"},{"key":"e_1_3_2_1_73_1","volume-title":"Proc. of Int. Conf. on Softw. Eng.: Software Engineering in Practice. 253-262","author":"Parry O.","unstructured":"O. Parry, G.M. Kapfhammer, M. Hilton, and P. McMinn. 2022. Surveying the developer experience of flaky tests. In Proc. of Int. Conf. on Softw. Eng.: Software Engineering in Practice. 253-262."},{"key":"e_1_3_2_1_74_1","volume-title":"Research, Best Practices, Applied Research Challenges and Opportunities. arXiv preprint arXiv:2408.13296","author":"Parthasarathy Venkatesh Balavadhani","year":"2024","unstructured":"Venkatesh Balavadhani Parthasarathy, Ahtsham Zafar, Aafaq Khan, and Arsalan Shahid. 2024. The Ultimate Guide to Fine-Tuning LLMs from Basics to Breakthroughs: An Exhaustive Review of Technologies, Research, Best Practices, Applied Research Challenges and Opportunities. arXiv preprint arXiv:2408.13296 (2024)."},{"key":"e_1_3_2_1_75_1","unstructured":"Baolin Peng Michel Galley Pengcheng He Hao Cheng Yujia Xie Yu Hu Qiuyuan Huang Lars Liden Zhou Yu Weizhu Chen and Jianfeng Gao. 2023. Check Your Facts and Try Again: Improving Large Language Models with External Knowledge and Automated Feedback. arXiv:2302.12813 [cs.CL] https:\/\/arxiv.org\/abs\/2302.12813"},{"key":"e_1_3_2_1_76_1","volume-title":"Hassan","author":"Rajbahadur Gopi Krishnan","year":"2025","unstructured":"Gopi Krishnan Rajbahadur, Gustavo A. Oliva, Dayi Lin, and Ahmed E. Hassan. 2025. From Cool Demos to Production-Ready FMware: Core Challenges and a Technology Roadmap. arXiv:2410.20791 [cs.SE] https:\/\/arxiv.org\/abs\/2410.20791"},{"key":"e_1_3_2_1_77_1","volume-title":"Navigating Complexity: Orchestrated Problem Solving with Multi-Agent LLMs. arXiv preprint arXiv:2402.16713","author":"Rasal Sumedh","year":"2024","unstructured":"Sumedh Rasal and EJ Hauer. 2024. Navigating Complexity: Orchestrated Problem Solving with Multi-Agent LLMs. arXiv preprint arXiv:2402.16713 (2024)."},{"key":"e_1_3_2_1_78_1","volume-title":"Nemo guardrails: A toolkit for controllable and safe llm applications with programmable rails. arXiv preprint arXiv:2310.10501","author":"Rebedea Traian","year":"2023","unstructured":"Traian Rebedea, Razvan Dinu, Makesh Sreedhar, Christopher Parisien, and Jonathan Cohen. 2023. Nemo guardrails: A toolkit for controllable and safe llm applications with programmable rails. arXiv preprint arXiv:2310.10501 (2023)."},{"key":"e_1_3_2_1_79_1","unstructured":"RobBagby. [n. d.]. Retry Storm antipattern - Performance antipatterns for cloud apps - learn.microsoft.com. https:\/\/learn.microsoft.com\/en-us\/azure\/architecture\/antipatterns\/retry-storm\/. [Accessed 07-10-2024]."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.3390\/math12060929"},{"key":"e_1_3_2_1_81_1","volume-title":"Blended RAG: Improving RAG (Retriever-Augmented Generation) Accuracy with Semantic Search and Hybrid Query-Based Retrievers. arXiv preprint arXiv:2404.07220","author":"Sawarkar Kunal","year":"2024","unstructured":"Kunal Sawarkar, Abhilasha Mangal, and Shivam Raj Solanki. 2024. Blended RAG: Improving RAG (Retriever-Augmented Generation) Accuracy with Semantic Search and Hybrid Query-Based Retrievers. arXiv preprint arXiv:2404.07220 (2024)."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"crossref","unstructured":"S. Schoch R. Mishra and Y. Ji. 2023. Data Selection for Fine-tuning Large Language Models Using Transferred Shapley Values. arXiv preprint arXiv:2306.10165 (2023).","DOI":"10.18653\/v1\/2023.acl-srw.37"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSME52107.2021.00053"},{"key":"e_1_3_2_1_84_1","volume-title":"Large language model alignment: A survey. arXiv preprint arXiv:2309.15025","author":"Shen Tianhao","year":"2023","unstructured":"Tianhao Shen, Renren Jin, Yufei Huang, Chuang Liu,Weilong Dong, Zishan Guo, Xinwei Wu, Yan Liu, and Deyi Xiong. 2023. Large language model alignment: A survey. arXiv preprint arXiv:2309.15025 (2023)."},{"key":"e_1_3_2_1_85_1","unstructured":"W. Shi A. Ajith M. Xia Y. Huang D. Liu et al. 2023. Detecting pretraining data from large language models. arXiv preprint arXiv:2310.16789 (2023)."},{"key":"e_1_3_2_1_86_1","first-page":"1146","article-title":"Interactive and Visual Prompt Engineering for Ad-hoc Task Adaptation with Large Language Models","volume":"29","author":"Strobelt H.","year":"2023","unstructured":"H. Strobelt, A. Webson, V. Sanh, B. Hoover, J. Beyer, et al. 2023. Interactive and Visual Prompt Engineering for Ad-hoc Task Adaptation with Large Language Models. IEEE Trans. on Visualization and Computer Graphics 29, 1 (2023), 1146-1156.","journal-title":"IEEE Trans. on Visualization and Computer Graphics"},{"key":"e_1_3_2_1_87_1","volume-title":"Sankaran Vaidyanathan, and Dieuwke Hupkes.","author":"Thakur Aman Singh","year":"2024","unstructured":"Aman Singh Thakur, Kartik Choudhary, Venkat Srinik Ramayapally, Sankaran Vaidyanathan, and Dieuwke Hupkes. 2024. Judging the Judges: Evaluating Alignment and Vulnerabilities in LLMs-as-Judges. arXiv preprint arXiv:2406.12624 (2024)."},{"key":"e_1_3_2_1_88_1","unstructured":"Unknown. 2023. SPDX 3.0 Dataset Profile. https:\/\/spdx.github.io\/spdx-spec\/v3.0\/model\/Dataset\/Dataset\/ Accessed: 2024-10-11."},{"key":"e_1_3_2_1_89_1","volume-title":"Proceedings of the International Conference on Modeling, Natural Language Processing and Machine Learning. 91-97","author":"Yang Yutian","year":"2024","unstructured":"CangqingWang, Yutian Yang, Ruisi Li, Dan Sun, Ruicong Cai, Yuzhu Zhang, and Chengqian Fu. 2024. Adapting llms for efficient context processing through soft prompt compression. In Proceedings of the International Conference on Modeling, Natural Language Processing and Machine Learning. 91-97."},{"key":"e_1_3_2_1_90_1","volume-title":"Voyager: An open-ended embodied agent with large language models. arXiv preprint arXiv:2305.16291","author":"Wang Guanzhi","year":"2023","unstructured":"Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, and Anima Anandkumar. 2023. Voyager: An open-ended embodied agent with large language models. arXiv preprint arXiv:2305.16291 (2023)."},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445645"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589335.3641257"},{"key":"e_1_3_2_1_93_1","volume-title":"A survey on ChatGPT: AI-generated contents, challenges, and solutions","author":"Wang Yuntao","year":"2023","unstructured":"Yuntao Wang, Yanghe Pan, Miao Yan, Zhou Su, and Tom H Luan. 2023. A survey on ChatGPT: AI-generated contents, challenges, and solutions. IEEE Open Journal of the Computer Society (2023)."},{"key":"e_1_3_2_1_94_1","unstructured":"J.Wei X.Wang D. Schuurmans M. Bosma E. Chi et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in Neural Info Processing Systems (2022)."},{"key":"e_1_3_2_1_95_1","unstructured":"OpenDataology workgroup. 2022. OpenDataology. https:\/\/github.com\/OpenDataology\/. Accessed 02-07-2024."},{"key":"e_1_3_2_1_96_1","unstructured":"Q. Wu G. Bansal J. Zhang Y. Wu B. Li E. Zhu et al. 2023. Auto-Gen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation. arXiv:2308.08155 [cs.AI]"},{"key":"e_1_3_2_1_97_1","volume-title":"How Easily do Irrelevant Inputs Skew the Responses of Large Language Models? arXiv preprint arXiv:2404.03302","author":"Wu Siye","year":"2024","unstructured":"Siye Wu, Jian Xie, Jiangjie Chen, Tinghui Zhu, Kai Zhang, and Yanghua Xiao. 2024. How Easily do Irrelevant Inputs Skew the Responses of Large Language Models? arXiv preprint arXiv:2404.03302 (2024)."},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"crossref","unstructured":"Yuchen Xia Jiho Kim Yuhan Chen Haojie Ye Souvik Kundu Nishil Talati et al. 2024. Understanding the Performance and Estimating the Cost of LLM Fine-Tuning. arXiv preprint arXiv:2408.04693 (2024).","DOI":"10.1109\/IISWC63097.2024.00027"},{"key":"e_1_3_2_1_99_1","volume-title":"Adaptive chameleon or stubborn sloth: Revealing the behavior of large language models in knowledge conflicts. arXiv preprint arXiv:2305.13300","author":"Xie Jian","year":"2023","unstructured":"Jian Xie, Kai Zhang, Jiangjie Chen, Renze Lou, and Yu Su. 2023. Adaptive chameleon or stubborn sloth: Revealing the behavior of large language models in knowledge conflicts. arXiv preprint arXiv:2305.13300 (2023)."},{"key":"e_1_3_2_1_100_1","unstructured":"Eugene Yan Bryan Bischof Charles Frye Hamel Husain Jason Liu and Shreya Shankar. [n. d.]. What We Learned from a Year of Building with LLMs (Part I) - oreilly.com. https:\/\/www.oreilly.com\/radar\/what-we-learned-from-a-year-of-building-with-llms-part-i\/. [Accessed 04-10-2024]."},{"key":"e_1_3_2_1_101_1","volume-title":"A survey on large language models for software engineering. arXiv preprint arXiv:2312.15223","author":"Zhang Quanjun","year":"2023","unstructured":"Quanjun Zhang, Chunrong Fang, Yang Xie, Yaxin Zhang, Yun Yang, Weisong Sun, Shengcheng Yu, and Zhenyu Chen. 2023. A survey on large language models for software engineering. arXiv preprint arXiv:2312.15223 (2023)."},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2025.3533972"},{"key":"e_1_3_2_1_103_1","volume-title":"Lima: Less is more for alignment. arXiv preprint arXiv:2305.11206","author":"Zhou C.","year":"2023","unstructured":"C. Zhou, P. Liu, P. Xu, S. Iyer, J. Sun, et al. 2023. Lima: Less is more for alignment. arXiv preprint arXiv:2305.11206 (2023)."},{"key":"e_1_3_2_1_104_1","volume-title":"Agents: An Open-source Framework for Autonomous Language Agents. arXiv:2309.07870 [cs.CL]","author":"Zhou W.","year":"2023","unstructured":"W. Zhou, Y.E. Jiang, L. Li, J. Wu, T. Wang, et al. 2023. Agents: An Open-source Framework for Autonomous Language Agents. arXiv:2309.07870 [cs.CL]"},{"key":"e_1_3_2_1_105_1","volume-title":"Ziwen Han, Keiran Paster, Silviu Pitis, Harris Chan, and Jimmy Ba.","author":"Zhou Yongchao","year":"2023","unstructured":"Yongchao Zhou, Andrei Ioan Muresanu, Ziwen Han, Keiran Paster, Silviu Pitis, Harris Chan, and Jimmy Ba. 2023. Large Language Models Are Human-Level Prompt Engineers. arXiv:2211.01910 [cs.LG] https:\/\/arxiv.org\/abs\/2211.01910"},{"key":"e_1_3_2_1_106_1","unstructured":"X. Zhu J. Li Y. Liu C. Ma and W. Wang. 2023. A survey on model compression for large language models. arXiv preprint arXiv:2308.07633 (2023)."},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"crossref","unstructured":"Ma\u0142gorzata \u0141azuka Andreea Anghel and Thomas Parnell. 2024. LLM-Pilot: Characterize and Optimize Performance of your LLM Inference Services. arXiv:2410.02425 https:\/\/arxiv.org\/abs\/2410.02425","DOI":"10.1109\/SC41406.2024.00022"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3736572","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T18:17:41Z","timestamp":1777573061000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3736572"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":107,"alternative-id":["10.1145\/3711896.3736572","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3736572","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}