{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T04:11:26Z","timestamp":1743826286004,"version":"3.40.3"},"publisher-location":"Cham","reference-count":61,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031887192","type":"print"},{"value":"9783031887208","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88720-8_45","type":"book-chapter","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T12:11:08Z","timestamp":1743768668000},"page":"291-299","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Large Language Models Are Human-Like Annotators"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1184-640X","authenticated-orcid":false,"given":"Mounika","family":"Marreddy","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5975-622X","authenticated-orcid":false,"given":"Subba Reddy","family":"Oota","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2843-3110","authenticated-orcid":false,"given":"Manish","family":"Gupta","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,3]]},"reference":[{"key":"45_CR1","unstructured":"Almazrouei, E., et\u00a0al.: The falcon series of 
open language models. arXiv preprint arXiv:2311.16867 (2023)"},{"key":"45_CR2","unstructured":"Anil, R., et\u00a0al.: Palm 2 Technical report. arXiv preprint arXiv:2305.10403 (2023)"},{"key":"45_CR3","doi-asserted-by":"crossref","unstructured":"Bai, G., et al.: MT-bench-101: a fine-grained benchmark for evaluating large language models in multi-turn dialogues. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 7421\u20137454, August 2024","DOI":"10.18653\/v1\/2024.acl-long.401"},{"key":"45_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al.: Language models are few-shot learners. NeurIPS 33, 1877\u20131901 (2020)","journal-title":"NeurIPS"},{"key":"45_CR5","unstructured":"Chan, C.M., et al.: Chateval: towards better LLM-based evaluators through multi-agent debate. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"45_CR6","doi-asserted-by":"crossref","unstructured":"Chen, D., Lee, C., Lu, Y., Rosati, D., Yu, Z.: Mixture of soft prompts for controllable data generation. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 14815\u201314833 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.988"},{"key":"45_CR7","unstructured":"Chen, M., et\u00a0al.: Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)"},{"key":"45_CR8","unstructured":"Chern, I., et\u00a0al.: FacTool: Factuality detection in generative AI\u2013a tool augmented framework for multi-task and multi-domain scenarios. arXiv preprint arXiv:2307.13528 (2023)"},{"key":"45_CR9","unstructured":"Chia, Y.K., Hong, P., Bing, L., Poria, S.: Instructeval: towards holistic evaluation of instruction-tuned large language models. 
In: Proceedings of the First edition of the Workshop on the Scaling Behavior of Large Language Models (SCALE-LLM 2024), pp. 35\u201364 (2024)"},{"issue":"70","key":"45_CR10","first-page":"1","volume":"25","author":"HW Chung","year":"2024","unstructured":"Chung, H.W., Hou, L., Longpre, S., Zoph, B., Tay, Y., Fedus, W., Li, Y., Wang, X., Dehghani, M., Brahma, S., et al.: Scaling instruction-finetuned language models. J. Mach. Learn. Res. 25(70), 1\u201353 (2024)","journal-title":"J. Mach. Learn. Res."},{"key":"45_CR11","doi-asserted-by":"crossref","unstructured":"Dhuliawala, S., et al.: Chain-of-verification reduces hallucination in large language models. In: ICLR 2024 Workshop on Reliable and Responsible Foundation Models (2024)","DOI":"10.18653\/v1\/2024.findings-acl.212"},{"key":"45_CR12","doi-asserted-by":"crossref","unstructured":"Ding, B., et al.: Is GPT-3 a good data annotator? In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 11173\u201311195 (2023)","DOI":"10.18653\/v1\/2023.acl-long.626"},{"key":"45_CR13","first-page":"30039","volume":"36","author":"Y Dubois","year":"2024","unstructured":"Dubois, Y., et al.: Alpacafarm: a simulation framework for methods that learn from human feedback. NeurIPS 36, 30039\u201330069 (2024)","journal-title":"NeurIPS"},{"key":"45_CR14","doi-asserted-by":"crossref","unstructured":"Fu, J., Ng, S.K., Jiang, Z., Liu, P.: GPTscore: evaluate as you desire. In: Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pp. 6556\u20136576 (2024)","DOI":"10.18653\/v1\/2024.naacl-long.365"},{"key":"45_CR15","unstructured":"Gemini\u00a0Team, Anil, R., et\u00a0al.: Gemini: a family of highly capable multimodal models. 
arXiv preprint arXiv:2312.11805 (2023)"},{"issue":"30","key":"45_CR16","doi-asserted-by":"crossref","DOI":"10.1073\/pnas.2305016120","volume":"120","author":"F Gilardi","year":"2023","unstructured":"Gilardi, F., Alizadeh, M., Kubli, M.: Chatgpt outperforms crowd workers for text-annotation tasks. Proc. Natl. Acad. Sci. 120(30), e2305016120 (2023)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"45_CR17","unstructured":"Gou, Z., et\u00a0al.: Tora: a tool-integrated reasoning agent for mathematical problem solving. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"45_CR18","unstructured":"Gupta, H., Scaria, K., Anantheswaran, U., Verma, S., Parmar, M., Sawant, S.A., Mishra, S., Baral, C.: Targen: Targeted data generation with large language models. COLM (2024)"},{"key":"45_CR19","doi-asserted-by":"crossref","unstructured":"He, X., et al.: AnnoLLM: making large language models to be better crowdsourced annotators. In: Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 6: Industry Track), pp. 165\u2013190 (2024)","DOI":"10.18653\/v1\/2024.naacl-industry.15"},{"key":"45_CR20","doi-asserted-by":"crossref","unstructured":"Honovich, O., Scialom, T., Levy, O., Schick, T.: Unnatural instructions: tuning language models with (almost) no human labor. In: The 61st Annual Meeting of the Association For Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.acl-long.806"},{"key":"45_CR21","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7b. arXiv preprint arXiv:2310.06825 (2023)"},{"key":"45_CR22","doi-asserted-by":"crossref","unstructured":"Jie, Z., Lu, W.: Leveraging training data in few-shot prompting for numerical reasoning. 
In: The 61st Annual Meeting of the Association For Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.findings-acl.668"},{"key":"45_CR23","doi-asserted-by":"crossref","unstructured":"Kamoi, R., Goyal, T., Rodriguez, J.D., Durrett, G.: WiCE: Real-world entailment for claims in Wikipedia. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 7561\u20137583 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.470"},{"key":"45_CR24","unstructured":"Kenton, J.D.M.W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, vol.\u00a01, p.\u00a02. Minneapolis, Minnesota (2019)"},{"key":"45_CR25","doi-asserted-by":"crossref","unstructured":"Koo, R., Lee, M., Raheja, V., Park, J.I., Kim, Z.M., Kang, D.: Benchmarking cognitive biases in large language models as evaluators. In: Findings of the Association for Computational Linguistics: ACL 2024, pp. 517\u2013545 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.29"},{"key":"45_CR26","doi-asserted-by":"crossref","unstructured":"Lee, D.H., Pujara, J., Sewak, M., White, R., Jauhar, S.: Making large language models better data creators. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 15349\u201315360 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.948"},{"key":"45_CR27","unstructured":"Lee, Y.J., Lim, C.G., Choi, Y., Lm, J.H., Choi, H.J.: PERSONACHATGEN: generating personalized dialogues using GPT-3. In: Proceedings of the 1st Workshop on Customized Chat Grounding Persona and Knowledge, pp. 29\u201348 (2022)"},{"key":"45_CR28","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 
3045\u20133059 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"45_CR29","first-page":"51991","volume":"36","author":"G Li","year":"2024","unstructured":"Li, G., Hammoud, H., Itani, H., Khizbullin, D., Ghanem, B.: Camel: communicative agents for \u201cmind\" exploration of large language model society. Adv. Neural. Inf. Process. Syst. 36, 51991\u201352008 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"45_CR30","doi-asserted-by":"crossref","unstructured":"Li, J., et al.: The dawn after the dark: an empirical study on factuality hallucination in large language models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 10879\u201310899, August 2024","DOI":"10.18653\/v1\/2024.acl-long.586"},{"key":"45_CR31","doi-asserted-by":"crossref","unstructured":"Li, J., Cheng, X., Zhao, X., Nie, J.Y., Wen, J.R.: Halueval: a large-scale hallucination evaluation benchmark for large language models. In: EMNLP (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.397"},{"key":"45_CR32","doi-asserted-by":"crossref","unstructured":"Li, M., Peng, B., Galley, M., Gao, J., Zhang, Z.: Self-checker: plug-and-play modules for fact-checking with large language models. In: Findings of the Association for Computational Linguistics: NAACL 2024, pp. 163\u2013181 (2024)","DOI":"10.18653\/v1\/2024.findings-naacl.12"},{"key":"45_CR33","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: Coannotating: uncertainty-guided work allocation between human and large language models for data annotation. In: EMNLP (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.92"},{"key":"45_CR34","doi-asserted-by":"crossref","unstructured":"Li, X., Qiu, X.: Mot: Memory-of-thought enables ChatGPT to self-improve. In: EMNLP, pp. 
6354\u20136374 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.392"},{"key":"45_CR35","doi-asserted-by":"crossref","unstructured":"Lin, Z., Gou, Z., Liang, T., Luo, R., Liu, H., Yang, Y.: CriticBench: Benchmarking LLMs for critique-correct reasoning. In: Findings of the Association for Computational Linguistics: ACL 2024, pp. 1552\u20131587 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.91"},{"key":"45_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Y., Iter, D., Xu, Y., Wang, S., Xu, R., Zhu, C.: G-eval: NLG evaluation using GPT-4 with better human alignment. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 2511\u20132522 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"issue":"6","key":"45_CR37","doi-asserted-by":"crossref","first-page":"771","DOI":"10.1038\/s41593-022-01070-0","volume":"25","author":"A Luppi","year":"2022","unstructured":"Luppi, A., et al.: A synergistic core for human brain evolution and cognition. Nat. Neurosci. 25(6), 771\u2013782 (2022)","journal-title":"Nat. Neurosci."},{"key":"45_CR38","doi-asserted-by":"crossref","unstructured":"Manakul, P., Liusie, A., Gales, M.: SelfcheckGPT: zero-resource black-box hallucination detection for generative large language models. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 9004\u20139017 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.557"},{"key":"45_CR39","unstructured":"Manyika, J., Hsiao, S.: An overview of bard: an early experiment with generative AI (2023). https:\/\/ai.google\/static\/documents\/google-about-bard.pdf"},{"key":"45_CR40","unstructured":"Marreddy, M., Oota, S.R., Chinni, V.C., Gupta, M., Flek, L.: USDC: A dataset of user stance and dogmatism in long conversations. 
arXiv preprint arXiv:2406.16833 (2024)"},{"key":"45_CR41","first-page":"462","volume":"35","author":"Y Meng","year":"2022","unstructured":"Meng, Y., Huang, J., Zhang, Y., Han, J.: Generating training data with language models: towards zero-shot language understanding. NeurIPS 35, 462\u2013477 (2022)","journal-title":"NeurIPS"},{"key":"45_CR42","unstructured":"Michelmann, S., Kumar, M., Norman, K.A., Toneva, M.: Large language models can segment narrative events similarly to humans. arXiv preprint arXiv:2301.10297 (2023)"},{"key":"45_CR43","unstructured":"M\u00fcndler, N., He, J., Jenko, S., Vechev, M.: Self-contradictory hallucinations of large language models: evaluation, detection and mitigation. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"45_CR44","doi-asserted-by":"crossref","unstructured":"Ono, K., Morita, A.: Evaluating large language models: ChatGPT-4, mistral 8x7b, and google Gemini benchmarked against MMLU. Authorea Preprints (2024)","DOI":"10.36227\/techrxiv.170956672.21573677\/v1"},{"key":"45_CR45","unstructured":"OpenAI, R.: GPT-4 Technical report. arXiv:2303.08774. View in Article 2(5) (2023)"},{"key":"45_CR46","unstructured":"Perez, E., et\u00a0al.: Discovering language model behaviors with model-written evaluations. In: Findings of the Association for Computational Linguistics: ACL 2023, pp. 13387\u201313434 (2023)"},{"key":"45_CR47","unstructured":"Santurkar, S., Durmus, E., Ladhak, F., Lee, C., Liang, P., Hashimoto, T.: Whose opinions do language models reflect? In: ICML, pp. 29971\u201330004. PMLR (2023)"},{"key":"45_CR48","unstructured":"Touvron, H., et\u00a0al.: LLaMA 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"45_CR49","unstructured":"Varshney, N., Yao, W., Zhang, H., Chen, J., Yu, D.: A stitch in time saves nine: Detecting and mitigating hallucinations of LLMs by validating low-confidence generation. 
arXiv preprint arXiv:2307.03987 (2023)"},{"key":"45_CR50","first-page":"1","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., et al.: Attention is all you need. NIPS 30, 1\u201311 (2017)","journal-title":"NIPS"},{"key":"45_CR51","doi-asserted-by":"crossref","unstructured":"Wei, J., et\u00a0al.: Symbol tuning improves in-context learning in language models. In: The 2023 Conference on Empirical Methods in Natural Language Processing (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.61"},{"key":"45_CR52","unstructured":"Ye, H., Liu, T., Zhang, A., Hua, W., Jia, W.: Cognitive mirage: a review of hallucinations in large language models. arXiv preprint arXiv:2309.06794 (2023)"},{"key":"45_CR53","doi-asserted-by":"crossref","unstructured":"Ye, J., et al.: ZeroGen: efficient zero-shot learning via dataset generation. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 11653\u201311669 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.801"},{"key":"45_CR54","doi-asserted-by":"crossref","unstructured":"Ye, J., Gao, J., Wu, Z., Feng, J., Yu, T., Kong, L.: ProGen: progressive zero-shot dataset generation via in-context feedback. In: Findings of the Association for Computational Linguistics: EMNLP 2022, pp. 3671\u20133683 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.269"},{"key":"45_CR55","unstructured":"Yu, W., Zhang, Z., Liang, Z., Jiang, M., Sabharwal, A.: Improving language models via plug-and-play retrieval feedback. arXiv preprint arXiv:2305.14002 (2023)"},{"key":"45_CR56","first-page":"55734","volume":"36","author":"Y Yu","year":"2024","unstructured":"Yu, Y., et al.: Large language model as attributed training data generator: a tale of diversity and bias. NeurIPS 36, 55734\u201355784 (2024)","journal-title":"NeurIPS"},{"key":"45_CR57","doi-asserted-by":"crossref","unstructured":"Yuan, P., et al.: BatchEval: towards human-like text evaluation. 
In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 15940\u201315958 (2024)","DOI":"10.18653\/v1\/2024.acl-long.846"},{"key":"45_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, R., Li, Y., Ma, Y., Zhou, M., Zou, L.: LLMaAA: making large language models as active annotators. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 13088\u201313103 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.872"},{"key":"45_CR59","first-page":"44502","volume":"36","author":"Y Zhao","year":"2024","unstructured":"Zhao, Y., Zhang, J., Chern, I., Gao, S., Liu, P., He, J., et al.: FELM: benchmarking factuality evaluation of large language models. NeurIPS 36, 44502\u201344523 (2024)","journal-title":"NeurIPS"},{"key":"45_CR60","first-page":"46595","volume":"36","author":"L Zheng","year":"2024","unstructured":"Zheng, L., et al.: Judging LLM-as-a-judge with MT-bench and chatbot arena. NeurIPS 36, 46595\u201346623 (2024)","journal-title":"NeurIPS"},{"key":"45_CR61","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Zhang, Y., Tan, C.: Flame: Few-shot learning from natural language explanations. 
arXiv preprint arXiv:2306.08042 (2023)","DOI":"10.18653\/v1\/2023.acl-long.372"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88720-8_45","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T12:11:46Z","timestamp":1743768706000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88720-8_45"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031887192","9783031887208"],"references-count":61,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88720-8_45","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"3 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lucca","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 April 2025","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"47","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2025.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}