{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T09:07:04Z","timestamp":1780909624457,"version":"3.54.1"},"publisher-location":"Singapore","reference-count":33,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819214679","type":"print"},{"value":"9789819214686","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-92-1468-6_19","type":"book-chapter","created":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T08:34:00Z","timestamp":1780907640000},"page":"300-311","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["GPU Kernel Optimization Beyond Full Builds: An LLM Framework with\u00a0Minimal Executable Programs"],"prefix":"10.1007","author":[{"given":"Ruifan","family":"Chu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Anbang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiuxiu","family":"Bai","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuai","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoshe","family":"Dong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,6,9]]},"reference":[{"key":"19_CR1","unstructured":"Chetlur, S., et al.: cuDNN: efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 (2014)"},{"key":"19_CR2","unstructured":"Kerr, A.: Cutlass: CUDA templates for linear algebra subroutines. Technical report, NVIDIA (2019)"},{"key":"19_CR3","unstructured":"Spector, B.F., Arora, S., Singhal, A., et\u00a0al.: Thunderkittens: simple, fast, and adorable ai kernels. arXiv preprint arXiv:2410.20399 (2024)"},{"key":"19_CR4","unstructured":"Dao, T.: FlashAttention-2: faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)"},{"key":"19_CR5","first-page":"68658","volume":"37","author":"J Shah","year":"2024","unstructured":"Shah, J., Bikshandi, G., Zhang, Y., Thakkar, V., Ramani, P., Dao, T.: FlashAttention-3: fast and accurate attention with asynchrony and low-precision. Adv. Neural. Inf. Process. Syst. 37, 68658\u201368685 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Li, C., Xu, Y., Saravani, S.M., Sadayappan, P.: Accelerated auto-optimization of GPU kernels for tensor computations. In: Proceedings of the 38th ACM International Conference on Supercomputing (ICS 2024), pp. 549\u2013561. ACM (2024)","DOI":"10.1145\/3650200.3656626"},{"key":"19_CR7","unstructured":"Wu, M., Cheng, X., Liu, S., et\u00a0al.: Mirage: a multi-level superoptimizer for tensor programs. In: 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI 2025), pp. 21\u201338 (2025)"},{"key":"19_CR8","unstructured":"Chen, T., et\u00a0al.: TVM: an automated end-to-end optimizing compiler for deep learning. In: 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 2018), pp. 578\u2013594 (2018)"},{"key":"19_CR9","unstructured":"Zheng, L., et\u00a0al.: Ansor: generating high-performance tensor programs for deep learning. In: 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 2020), pp. 863\u2013879 (2020)"},{"key":"19_CR10","unstructured":"Ouyang, A., et al.: KernelBench: can LLMs write efficient GPU kernels? In: Proceedings of the 42nd International Conference on Machine Learning (ICML 2025), vol. 267. PMLR (2025). arXiv preprint arXiv:2502.10517"},{"key":"19_CR11","unstructured":"Wen, Z., Zhang, Y., Li, Z., Liu, Z., Xie, L., Zhang, T.: MultiKernelBench: a multi-platform benchmark for kernel generation. arXiv preprint arXiv:2507.17773 (2025)"},{"key":"19_CR12","unstructured":"Chen, M., et\u00a0al.: Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)"},{"issue":"6624","key":"19_CR13","doi-asserted-by":"publisher","first-page":"1092","DOI":"10.1126\/science.abq1158","volume":"378","author":"Y Li","year":"2022","unstructured":"Li, Y., et al.: Competition-level code generation with alphacode. Science 378(6624), 1092\u20131097 (2022)","journal-title":"Science"},{"key":"19_CR14","unstructured":"Lange, R.T., Prasad, A., Sun, Q., Faldor, M., Tang, Y., Ha, D.: The AI CUDA engineer: agentic CUDA kernel discovery, optimization and composition. Technical report, Sakana AI (2025)"},{"key":"19_CR15","unstructured":"Chen, W., Zhu, J., Fan, Q., Ma, Y., Zou, A.: CUDA-LLM: LLMs can write efficient CUDA kernels. arXiv preprint arXiv:2506.09092 (2025)"},{"key":"19_CR16","unstructured":"Andrews, M., Witteveen, S.: GPU kernel scientist: an LLM-driven framework for iterative kernel optimization. arXiv preprint arXiv:2506.20807 (2025)"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Li, J., et\u00a0al.: TritonBench: benchmarking large language model capabilities for generating triton operators. arXiv preprint arXiv:2502.14752 (2025)","DOI":"10.18653\/v1\/2025.findings-acl.1183"},{"key":"19_CR18","unstructured":"Zheng, L., et\u00a0al.: Efficiently programming large language models using SGLang. Technical\/preprint (2023)"},{"issue":"6","key":"19_CR19","doi-asserted-by":"publisher","first-page":"519","DOI":"10.1145\/2499370.2462176","volume":"48","author":"J Ragan-Kelley","year":"2013","unstructured":"Ragan-Kelley, J., Barnes, C., Adams, A., Paris, S., Durand, F., Amarasinghe, S.: Halide: a language and compiler for optimizing parallelism, locality, and recomputations in image processing pipelines. ACM SIGPLAN Not. 48(6), 519\u2013530 (2013)","journal-title":"ACM SIGPLAN Not."},{"key":"19_CR20","unstructured":"Lattner, C., et\u00a0al.: MLIR: a compiler infrastructure for the end of Moore\u2019s law. arXiv preprint arXiv:2002.11054 (2020)"},{"key":"19_CR21","unstructured":"Abadi, M., et\u00a0al.: TensorFlow: large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:1603.04467 (2016)"},{"key":"19_CR22","unstructured":"Thakkar, V., et\u00a0al.: Cutlass (CUDA templates for linear algebra subroutines, version 3.0.0) (2023)"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Taneja, J., Laird, A., Yan, C., Musuvathi, M., Lahiri, S.K: LLM-vectorizer: LLM-based verified loop vectorizer. In: Proceedings of the 23rd ACM\/IEEE International Symposium on Code Generation and Optimization, pp. 137\u2013149 (2025)","DOI":"10.1145\/3696443.3708929"},{"key":"19_CR24","unstructured":"Wei, A., et al.: Improving assembly code performance with large language models via reinforcement learning (2025)"},{"key":"19_CR25","unstructured":"Zhai, Y., et\u00a0al.: Enabling tensor language model to assist in generating high-performance tensor programs for deep learning. In: 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 2024), pp. 289\u2013305 (2024)"},{"key":"19_CR26","unstructured":"Wei, A., et\u00a0al.: EquiBench: benchmarking code reasoning capabilities of large language models via equivalence checking. arXiv preprint arXiv:2502.12466 (2025)"},{"key":"19_CR27","unstructured":"Agrawal, L.A., et\u00a0al.: GEPA: reflective prompt evolution can outperform reinforcement learning (2025)"},{"key":"19_CR28","unstructured":"Baronio, C., Marsella, P., Pan, B., Guo, S., Alberti, S.: Kevin: multi-turn RL for generating CUDA kernels (2025)"},{"key":"19_CR29","unstructured":"Haiguang DCU official site. https:\/\/www.hygon.cn\/index"},{"key":"19_CR30","unstructured":"Polybench. https:\/\/github.com\/sgrauerg\/polybenchGpu"},{"key":"19_CR31","unstructured":"AMD app SDK. https:\/\/en.wikipedia.org\/wiki\/AMD_APP_SDK"},{"key":"19_CR32","unstructured":"LAMMPS. https:\/\/www.lammps.org\/"},{"key":"19_CR33","unstructured":"MISA-MD. https:\/\/misa-md.github.io\/MDoc\/"}],"container-title":["Lecture Notes in Computer Science","Advances in Knowledge Discovery and Data Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-92-1468-6_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T08:35:07Z","timestamp":1780907707000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-92-1468-6_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819214679","9789819214686"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-981-92-1468-6_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"9 June 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PAKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific-Asia Conference on Knowledge Discovery and Data Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hong Kong","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 June 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 June 2026","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pakdd2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.pakdd2026.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}