{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T16:48:08Z","timestamp":1755794888758,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100000002","name":"NIH (National Institutes of Health)","doi-asserted-by":"publisher","award":["1R01LM014012"],"award-info":[{"award-number":["1R01LM014012"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["217071, 2213700, 2106913, 2008208"],"award-info":[{"award-number":["217071, 2213700, 2106913, 2008208"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737419","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T20:52:41Z","timestamp":1754254361000},"page":"5888-5899","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["IdeaBench: Benchmarking Large Language Models for Research Idea Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4764-3359","authenticated-orcid":false,"given":"Sikun","family":"Guo","sequence":"first","affiliation":[{"name":"Department of Computer Science, University of Virginia, Charlottesville, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6590-3748","authenticated-orcid":false,"given":"Amir Hassan","family":"Shariatmadari","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Virginia, Charlottesville, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8049-5298","authenticated-orcid":false,"given":"Guangzhi","family":"Xiong","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Virginia, Charlottesville, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4875-805X","authenticated-orcid":false,"given":"Albert","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Virginia, Charlottesville, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0424-5473","authenticated-orcid":false,"given":"Myles","family":"Kim","sequence":"additional","affiliation":[{"name":"School of Medicine, University of Virginia, Charlottesville, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3466-916X","authenticated-orcid":false,"given":"Corey M.","family":"Williams","sequence":"additional","affiliation":[{"name":"School of Medicine, University of Virginia, Charlottesville, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3177-4346","authenticated-orcid":false,"given":"Stefan","family":"Bekiranov","sequence":"additional","affiliation":[{"name":"School of Medicine, University of Virginia, Charlottesville, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9723-3246","authenticated-orcid":false,"given":"Aidong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Virginia, Charlottesville, VA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Microsoft Research AI4Science and Microsoft Azure Quantum. 2023. The impact of large language models on scientific discovery: a preliminary study using gpt-4. arXiv preprint arXiv:2311.07361(2023)."},{"key":"e_1_3_2_2_2_1","volume-title":"Silviu Cucerzan, and Sung Ju Hwang.","author":"Baek Jinheon","year":"2024","unstructured":"Jinheon Baek, Sujay Kumar Jauhar, Silviu Cucerzan, and Sung Ju Hwang. 2024. Researchagent: Iterative research idea generation over scientific literature with large language models. arXiv preprint arXiv:2404.07738(2024)."},{"key":"e_1_3_2_2_3_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Bai Yushi","year":"2024","unstructured":"Yushi Bai, Jiahao Ying, Yixin Cao, Xin Lv, Yuze He, Xiaozhi Wang, Jifan Yu, Kaisheng Zeng, Yijia Xiao, Haozhe Lyu, et al., 2024. Benchmarking foundation models with language-model-as-an-examiner. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_4_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877-1901."},{"key":"e_1_3_2_2_5_1","volume-title":"Yuanzhi Li, Scott Lundberg, et al.","author":"Bubeck S\u00e9bastien","year":"2023","unstructured":"S\u00e9bastien Bubeck, Varun Chandrasekaran, Ronen Eldan, Johannes Gehrke, Eric Horvitz, Ece Kamar, Peter Lee, Yin Tat Lee, Yuanzhi Li, Scott Lundberg, et al., 2023. Sparks of artificial general intelligence: Early experiments with gpt-4. arXiv preprint arXiv:2303.12712(2023)."},{"key":"e_1_3_2_2_6_1","unstructured":"Cheng-Han Chiang and Hung-yi Lee. 2023. Can large language models be an alternative to human evaluations? arXiv preprint arXiv:2305.01937(2023)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.365"},{"key":"e_1_3_2_2_8_1","unstructured":"Google Scholar. 2024. Google Scholar Top Publications. https:\/\/scholar.google.com\/citations?view_op=top_venues."},{"key":"e_1_3_2_2_9_1","volume-title":"Embracing Foundation Models for Advancing Scientific Discovery. In 2024 IEEE International Conference on Big Data (BigData). IEEE, 1746-1755","author":"Guo Sikun","year":"2024","unstructured":"Sikun Guo, Amir Hassan Shariatmadari, Guangzhi Xiong, and Aidong Zhang. 2024. Embracing Foundation Models for Advancing Scientific Discovery. In 2024 IEEE International Conference on Big Data (BigData). IEEE, 1746-1755."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611978520.45"},{"key":"e_1_3_2_2_11_1","volume-title":"Nova: An iterative planning and search approach to enhance novelty and diversity of llm generated ideas. arXiv preprint arXiv:2410.14255(2024).","author":"Hu Xiang","year":"2024","unstructured":"Xiang Hu, Hongyu Fu, Jinge Wang, Yifeng Wang, Zhikun Li, Renjun Xu, Yu Lu, Yaochu Jin, Lili Pan, and Zhenzhong Lan. 2024. Nova: An iterative planning and search approach to enhance novelty and diversity of llm generated ideas. arXiv preprint arXiv:2410.14255(2024)."},{"key":"e_1_3_2_2_12_1","unstructured":"Rodney Kinney Chloe Anastasiades Russell Authur Iz Beltagy Jonathan Bragg Alexandra Buraczynski Isabel Cachola Stefan Candra Yoganand Chandrasekhar Arman Cohan et al. 2023. The semantic scholar open data platform. arXiv preprint arXiv:2301.10140(2023)."},{"key":"e_1_3_2_2_13_1","volume-title":"Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa.","author":"Kojima Takeshi","year":"2022","unstructured":"Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. 2022. Large language models are zero-shot reasoners. Advances in neural information processing systems, Vol. 35 (2022), 22199-22213."},{"key":"e_1_3_2_2_14_1","unstructured":"Ruochen Li Liqiang Jing Chi Han Jiawei Zhou and Xinya Du. 2024. Learning to Generate Research Idea with Dynamic Control. arXiv preprint arXiv:2412.14626(2024)."},{"key":"e_1_3_2_2_15_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Annual Meeting of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:964287","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Annual Meeting of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:964287"},{"key":"e_1_3_2_2_16_1","volume-title":"G-eval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634(2023).","author":"Liu Yang","year":"2023","unstructured":"Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu, and Chenguang Zhu. 2023. G-eval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634(2023)."},{"key":"e_1_3_2_2_17_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. ArXiv Vol. abs\/2303.08774 (2023). https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_2_2_18_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_2_19_1","unstructured":"Linlu Qiu Liwei Jiang Ximing Lu Melanie Sclar Valentina Pyatkin Chandra Bhagavatula Bailin Wang Yoon Kim Yejin Choi Nouha Dziri et al. 2023. Phenomenal yet puzzling: Testing inductive reasoning capabilities of language models with hypothesis refinement. arXiv preprint arXiv:2310.08559(2023)."},{"key":"e_1_3_2_2_20_1","unstructured":"Machel Reid Nikolay Savinov Denis Teplyashin Dmitry Lepikhin Timothy Lillicrap Jean-baptiste Alayrac Radu Soricut Angeliki Lazaridou Orhan Firat Julian Schrittwieser et al. 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530(2024)."},{"key":"e_1_3_2_2_21_1","unstructured":"Samuel Schmidgall Yusheng Su Ze Wang Ximeng Sun Jialian Wu Xiaodong Yu Jiang Liu Zicheng Liu and Emad Barsoum. 2025. Agent laboratory: Using llm agents as research assistants. arXiv preprint arXiv:2501.04227(2025)."},{"key":"e_1_3_2_2_22_1","unstructured":"Chenglei Si Diyi Yang and Tatsunori Hashimoto. 2024. Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with 100 NLP Researchers. arXiv preprint arXiv:2409.04109(2024)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Brian Thompson and Matt Post. 2020. Automatic machine translation evaluation in many languages via zero-shot paraphrasing. arXiv preprint arXiv:2004.14564(2020).","DOI":"10.18653\/v1\/2020.emnlp-main.8"},{"key":"e_1_3_2_2_24_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288(2023)."},{"key":"e_1_3_2_2_25_1","volume-title":"Foundational Autoraters: Taming Large Language Models for Better Automatic Evaluation. arXiv preprint arXiv:2407.10817(2024).","author":"Vu Tu","year":"2024","unstructured":"Tu Vu, Kalpesh Krishna, Salaheddin Alzubi, Chris Tar, Manaal Faruqui, and Yun-Hsuan Sung. 2024. Foundational Autoraters: Taming Large Language Models for Better Automatic Evaluation. arXiv preprint arXiv:2407.10817(2024)."},{"key":"e_1_3_2_2_26_1","unstructured":"Qingyun Wang Doug Downey Heng Ji and Tom Hope. 2023a. Learning to generate novel scientific directions with contextualized literature-based discovery. arXiv preprint arXiv:2305.14259(2023)."},{"key":"e_1_3_2_2_27_1","volume-title":"Scimon: Scientific inspiration machines optimized for novelty. arXiv preprint arXiv:2305.14259(2023).","author":"Wang Qingyun","year":"2023","unstructured":"Qingyun Wang, Doug Downey, Heng Ji, and Tom Hope. 2023b. Scimon: Scientific inspiration machines optimized for novelty. arXiv preprint arXiv:2305.14259(2023)."},{"key":"e_1_3_2_2_28_1","volume-title":"Sikun Guo, Stefan Bekiranov, and Aidong Zhang.","author":"Xiong Guangzhi","year":"2025","unstructured":"Guangzhi Xiong, Eric Xie, Corey Williams, Myles Kim, Amir Hassan Shariatmadari, Sikun Guo, Stefan Bekiranov, and Aidong Zhang. 2025. Toward Reliable Biomedical Hypothesis Generation: Evaluating Truthfulness and Hallucination in Large Language Models. arXiv preprint arXiv:2505.14599(2025)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Zonglin Yang Xinya Du Junxian Li Jie Zheng Soujanya Poria and Erik Cambria. 2023. Large language models for automated open-domain scientific hypotheses discovery. arXiv preprint arXiv:2309.02726(2023).","DOI":"10.18653\/v1\/2024.findings-acl.804"},{"key":"e_1_3_2_2_30_1","first-page":"27263","article-title":"Bartscore: Evaluating generated text as text generation","volume":"34","author":"Yuan Weizhe","year":"2021","unstructured":"Weizhe Yuan, Graham Neubig, and Pengfei Liu. 2021. Bartscore: Evaluating generated text as text generation. Advances in Neural Information Processing Systems, Vol. 34 (2021), 27263-27277.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_31_1","volume-title":"Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675(2019).","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q Weinberger, and Yoav Artzi. 2019. Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675(2019)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Wei Zhao Maxime Peyrard Fei Liu Yang Gao Christian M Meyer and Steffen Eger. 2019. MoverScore: Text generation evaluating with contextualized embeddings and earth mover distance. arXiv preprint arXiv:1909.02622(2019).","DOI":"10.18653\/v1\/D19-1053"},{"key":"e_1_3_2_2_33_1","unstructured":"Yangqiaoyu Zhou Haokun Liu Tejes Srivastava Hongyuan Mei and Chenhao Tan. 2024. Hypothesis Generation with Large Language Models. arXiv preprint arXiv:2404.04326(2024)."}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"],"location":"Toronto ON Canada","acronym":"KDD '25"},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737419","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T14:31:22Z","timestamp":1755354682000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737419"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":33,"alternative-id":["10.1145\/3711896.3737419","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737419","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}