{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T07:02:19Z","timestamp":1780729339568,"version":"3.54.1"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819214617","type":"print"},{"value":"9789819214624","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-92-1462-4_33","type":"book-chapter","created":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T06:46:03Z","timestamp":1780728363000},"page":"418-430","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Diverse and\u00a0Task-Specific Data Selection for\u00a0Instruction Tuning"],"prefix":"10.1007","author":[{"given":"Juncheng","family":"Diao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Saiping","family":"Guan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gaoyu","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiafeng","family":"Guo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xueqi","family":"Cheng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,6,7]]},"reference":[{"key":"33_CR1","unstructured":"Brown, T.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"33_CR2","doi-asserted-by":"crossref","unstructured":"Bukharin, A.: Data diversity matters for robust instruction tuning. In: Findings of EMNLP, pp. 3411\u20133425 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.195"},{"key":"33_CR3","unstructured":"Chen, L.: AlpaGasus: training a better alpaca with fewer data. In: ICLR (2024)"},{"key":"33_CR4","unstructured":"Chen, M.: Evaluating LLMs trained on code (2021). arXiv:2107.03374"},{"key":"33_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Y.: Mig: Automatic data selection for instruction tuning bymaximizing information gain in semantic space. In: Findings of ACL, pp. 9902\u20139915 (2025)","DOI":"10.18653\/v1\/2025.findings-acl.515"},{"key":"33_CR6","unstructured":"Clark, P.: Think you have solved question answering? try arc, the ai2 reasoning challenge (2018). arXiv:1803.05457"},{"key":"33_CR7","unstructured":"Cobbe, K.: Training verifiers to solve math problems (2021). arXiv:2110.14168"},{"key":"33_CR8","unstructured":"Dubois, Y., Liang, P., Hashimoto, T.: Length-Controlled alpacaeval: a simple debiasing of automatic evaluators. In: COLMD (2024)"},{"key":"33_CR9","doi-asserted-by":"crossref","unstructured":"Gardner, M.: Allennlp: a deep semantic natural language processing platform. In: ACL Workshop, pp. 1\u20136 (2018)","DOI":"10.18653\/v1\/W18-2501"},{"key":"33_CR10","doi-asserted-by":"crossref","unstructured":"Ge, Y.: Clustering and ranking: diversity-preserved instruction selection through expert-aligned quality estimation. In: EMNLP, pp. 464\u2013478 (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.28"},{"key":"33_CR11","unstructured":"Gunasekar, S.: Textbooks are all you need (2023). arXiv:2306.11644"},{"key":"33_CR12","unstructured":"Hendrycks, D.: Measuring massive multitask language understanding. In: ICLR (2021)"},{"key":"33_CR13","unstructured":"Kou, S., et al.: Which data attributes stimulate math and code reasoning?an investigation via influence functions. In: ICML Workshop (2025)"},{"key":"33_CR14","unstructured":"Lambert, N.: Tulu 3: pushing frontiers in open language model post-training. In: COLM (2025)"},{"key":"33_CR15","doi-asserted-by":"crossref","unstructured":"Li, M.: From quantity to quality: boosting LLM performance with self-guided data selection for instruction tuning. In: NAACL-HLT, pp. 7602\u20137635 (2024)","DOI":"10.18653\/v1\/2024.naacl-long.421"},{"key":"33_CR16","unstructured":"Liu, L.: SelectIT: selective instruction tuning for LLMs via uncertainty-aware self-reflection. In: NeurIPS (2024)"},{"key":"33_CR17","unstructured":"Liu, W.: what makes good data for alignment? a comprehensive study of automatic data selection in instruction tuning. In: ICLR (2024)"},{"key":"33_CR18","unstructured":"Lu, K.: #InsTag: instruction tagging for analyzing supervised fine-tuning of large language models. In: ICLR (2024)"},{"key":"33_CR19","doi-asserted-by":"crossref","unstructured":"Lu, Z., et al.: PIPER: benchmarking and prompting event reasoning boundary of LLMs via debiasing-distillation enhanced tuning. In: ACL. ACL (2025)","DOI":"10.18653\/v1\/2025.acl-long.1389"},{"key":"33_CR20","doi-asserted-by":"crossref","unstructured":"Lu, Z.: Rethinking the reversal curse of LLMs: a prescription from human knowledge reversal. In: EMNLP (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.428"},{"key":"33_CR21","doi-asserted-by":"crossref","unstructured":"Suzgun, M.: Challenging big-bench tasks and whether chain-of-thought can solve them. In: Findings of ACL, pp. 13003\u201313051 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.824"},{"key":"33_CR22","unstructured":"Teknium, Openhermes 2.5: an open dataset of synthetic data for generalist LLM assistants (2023). https:\/\/huggingface.co\/datasets\/teknium\/OpenHermes-2.5"},{"key":"33_CR23","unstructured":"Wu, S.: Self-evolved diverse data sampling for efficient instruction tuning (2023). arXiv:2311.08182"},{"key":"33_CR24","unstructured":"Xia, M.: LESS: selecting influential data for targeted instruction tuning. In: ICLR Workshop (2024)"},{"key":"33_CR25","doi-asserted-by":"crossref","unstructured":"Yang, Y.: Measuring data diversity for instruction tuning: a systematic analysis and a reliable metric, pp. 18530\u201318549. ACL (2025)","DOI":"10.18653\/v1\/2025.acl-long.908"},{"key":"33_CR26","unstructured":"Yin, M.: Entropy law: the story behind data compression and LLM performance (2024). arXiv:2407.06645"},{"key":"33_CR27","unstructured":"Yu, S.: Diversify and conquer: diversity-centric data selection with iterative refinement (2024). arXiv:2409.11378"},{"key":"33_CR28","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Chen, J., Ma, C.: Active instruction tuning for large language models with reference-free instruction selection, pp. 16\u201328. PAKDD (2025)","DOI":"10.1007\/978-981-96-8180-8_2"},{"key":"33_CR29","doi-asserted-by":"crossref","unstructured":"Zheng, L.: Judging LLM-as-a-judge with MT-bench and chatbot arena. In: NeurIPS, pp. 46595\u201346623 (2023)","DOI":"10.52202\/075280-2020"},{"key":"33_CR30","doi-asserted-by":"crossref","unstructured":"Zheng, Y., et al.: LlamaFactory: unified efficient fine-tuning of 100+ language models. In: ACL, pp. 400\u2013410 (2024)","DOI":"10.18653\/v1\/2024.acl-demos.38"},{"key":"33_CR31","unstructured":"Zhou, C.: LIMA: less is more for alignment. In: NeurIPS (2023)"},{"key":"33_CR32","unstructured":"Zhou, J.: Instruction-following evaluation for LLMs (2023). arXiv:2311.07911"}],"container-title":["Lecture Notes in Computer Science","Advances in Knowledge Discovery and Data Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-92-1462-4_33","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T06:46:09Z","timestamp":1780728369000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-92-1462-4_33"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819214617","9789819214624"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-92-1462-4_33","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"7 June 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PAKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific-Asia Conference on Knowledge Discovery and Data Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hong Kong","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 June 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 June 2026","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pakdd2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.pakdd2026.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}