{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T18:22:49Z","timestamp":1763922169644,"version":"3.45.0"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032076113","type":"print"},{"value":"9783032076120","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-07612-0_37","type":"book-chapter","created":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T17:57:33Z","timestamp":1763920653000},"page":"480-493","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Stream-K++: Adaptive GPU GEMM Kernel Selection and\u00a0Scheduling for\u00a0AI Using Bloom Filters"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2832-458X","authenticated-orcid":false,"given":"Harisankar","family":"Sadasivan","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3400-4195","authenticated-orcid":false,"given":"Muhammed Emin","family":"Ozturk","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1616-6817","authenticated-orcid":false,"given":"Muhammad","family":"Osama","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2385-245X","authenticated-orcid":false,"given":"Chris","family":"Millette","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7611-293X","authenticated-orcid":false,"given":"Astha","family":"Rai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3312-8038","authenticated-orcid":false,"given":"Maksim","family":"Podkorytov","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9577-099X","authenticated-orcid":false,"given":"John","family":"Afaganis","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3048-7642","authenticated-orcid":false,"given":"Carlus","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8114-1080","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4945-1240","authenticated-orcid":false,"given":"Jun","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,24]]},"reference":[{"key":"37_CR1","unstructured":"ck. https:\/\/github.com\/ROCm\/composable_kernel"},{"key":"37_CR2","unstructured":"cublas. https:\/\/docs.nvidia.com\/cuda\/cublas\/"},{"key":"37_CR3","unstructured":"Cutlass. https:\/\/github.com\/NVIDIA\/cutlass"},{"key":"37_CR4","unstructured":"Efficient gemm in cuda. https:\/\/github.com\/NVIDIA\/cutlass\/blob\/main\/media\/docs\/efficient_gemm.md"},{"key":"37_CR5","unstructured":"Nvidia tensorrt. https:\/\/docs.nvidia.com\/deeplearning\/tensorrt\/developer-guide\/index.html"},{"key":"37_CR6","unstructured":"Python implementation of murmur hash. https:\/\/github.com\/hajimes\/mmh3"},{"key":"37_CR7","unstructured":"rocblas. https:\/\/github.com\/ROCm\/rocBLAS"},{"key":"37_CR8","doi-asserted-by":"crossref","unstructured":"Aminabadi, R.Y., et\u00a0al.: Deepspeed-inference: enabling efficient inference of transformer models at unprecedented scale. In: SC22: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201315. IEEE (2022)","DOI":"10.1109\/SC41404.2022.00051"},{"key":"37_CR9","doi-asserted-by":"crossref","unstructured":"Barrachina, S., Castillo, M., Igual, F.D., Mayo, R., Quintana-Orti, E.S.: Evaluation and tuning of the level 3 cublas for graphics processors. In: 2008 IEEE International Symposium on Parallel and Distributed Processing, pp.\u00a01\u20138. IEEE (2008)","DOI":"10.1109\/IPDPS.2008.4536485"},{"key":"37_CR10","unstructured":"Chen, T., et al.: TVM: end-to-end optimization stack for deep learning. arXiv preprint arXiv:1802.0479911(2018), 20 (2018)"},{"key":"37_CR11","doi-asserted-by":"crossref","unstructured":"Cui, X., Chen, Y., Zhang, C., Mei, H.: Auto-tuning dense matrix multiplication for GPGPU with cache. In: 2010 IEEE 16th International Conference on Parallel and Distributed Systems, pp. 237\u2013242. IEEE (2010)","DOI":"10.1109\/ICPADS.2010.64"},{"key":"37_CR12","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"37_CR13","doi-asserted-by":"crossref","unstructured":"Dongarra, J., et al.: Accelerating numerical dense linear algebra calculations with GPUs. In: Numerical Computations with GPUs, pp. 3\u201328 (2014)","DOI":"10.1007\/978-3-319-06548-9_1"},{"key":"37_CR14","doi-asserted-by":"crossref","unstructured":"Fuhrer, O., et\u00a0al.: Near-global climate simulation at 1 km resolution: establishing a performance baseline on 4888 GPUs with COSMO 5.0. Geoscientific Model Develop. 11(4), 1665\u20131681 (2018)","DOI":"10.5194\/gmd-11-1665-2018"},{"key":"37_CR15","doi-asserted-by":"crossref","unstructured":"Gupta, U., et\u00a0al.: The architectural implications of Facebook\u2019s DNN-based personalized recommendation. In: 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA), pp. 488\u2013501. IEEE (2020)","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"37_CR16","doi-asserted-by":"crossref","unstructured":"Hagedorn, B., Elliott, A.S., Barthels, H., Bodik, R., Grover, V.: Fireiron: A data-movement-aware scheduling language for GPUs. In: Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques, pp. 71\u201382 (2020)","DOI":"10.1145\/3410463.3414632"},{"key":"37_CR17","doi-asserted-by":"crossref","unstructured":"Jiang, C., Snir, M.: Automatic tuning matrix multiplication performance on graphics hardware. In: 14th International Conference on Parallel Architectures and Compilation Techniques (PACT\u201905), pp. 185\u2013194. IEEE (2005)","DOI":"10.1109\/PACT.2005.10"},{"key":"37_CR18","unstructured":"Khan, J., et\u00a0al.: Miopen: an open source library for deep learning primitives. arXiv preprint arXiv:1910.00078 (2019)"},{"key":"37_CR19","unstructured":"Kublik, S., Saboo, S.: GPT-3. O\u2019Reilly Media, Inc. (2022)"},{"key":"37_CR20","doi-asserted-by":"crossref","unstructured":"Larsen, E.S., McAllister, D.: Fast matrix multiplies using graphics hardware. In: Proceedings of the 2001 ACM\/IEEE Conference on Supercomputing, p. 55 (2001)","DOI":"10.1145\/582034.582089"},{"key":"37_CR21","doi-asserted-by":"publisher","unstructured":"Li, Y., Dongarra, J., Tomov, S.: A Note on Auto-tuning GEMM for GPUs. In: Allen, G., Nabrzyski, J., Seidel, E., van Albada, G.D., Dongarra, J., Sloot, P.M.A. (eds.) ICCS 2009. LNCS, vol. 5544, pp. 884\u2013892. Springer, Heidelberg (2009). https:\/\/doi.org\/10.1007\/978-3-642-01970-8_89","DOI":"10.1007\/978-3-642-01970-8_89"},{"issue":"4","key":"37_CR22","doi-asserted-by":"publisher","first-page":"511","DOI":"10.1177\/1094342010385729","volume":"24","author":"R Nath","year":"2010","unstructured":"Nath, R., Tomov, S., Dongarra, J.: An improved magma GEMM for fermi graphics processing units. Int. J. High Perform. Comput. Appl. 24(4), 511\u2013515 (2010)","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"37_CR23","doi-asserted-by":"crossref","unstructured":"Osama, M., Merrill, D., Cecka, C., Garland, M., Owens, J.D.: Stream-k: work-centric parallel decomposition for dense matrix-matrix multiplication on the GPU. In: Proceedings of the 28th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming, pp. 429\u2013431 (2023)","DOI":"10.1145\/3572848.3577479"},{"key":"37_CR24","unstructured":"Paszke, A., et\u00a0al.: Pytorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"37_CR25","unstructured":"Pharr, M., Jakob, W., Humphreys, G.: Physically Based Rendering: From Theory to Implementation. MIT Press (2023)"},{"issue":"6","key":"37_CR26","doi-asserted-by":"publisher","first-page":"519","DOI":"10.1145\/2499370.2462176","volume":"48","author":"J Ragan-Kelley","year":"2013","unstructured":"Ragan-Kelley, J., Barnes, C., Adams, A., Paris, S., Durand, F., Amarasinghe, S.: Halide: a language and compiler for optimizing parallelism, locality, and recomputation in image processing pipelines. ACM Sigplan Notices 48(6), 519\u2013530 (2013)","journal-title":"ACM Sigplan Notices"},{"key":"37_CR27","doi-asserted-by":"crossref","unstructured":"Tan, G., Li, L., Triechle, S., Phillips, E., Bao, Y., Sun, N.: Fast implementation of DGEMM on fermi GPU. In: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201311 (2011)","DOI":"10.1145\/2063384.2063431"},{"key":"37_CR28","doi-asserted-by":"crossref","unstructured":"Tillet, P., Cox, D.: Input-aware auto-tuning of compute-bound HPC kernels. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201312 (2017)","DOI":"10.1145\/3126908.3126939"},{"key":"37_CR29","doi-asserted-by":"crossref","unstructured":"Tillet, P., Kung, H.T., Cox, D.: Triton: an intermediate language and compiler for tiled neural network computations. In: Proceedings of the 3rd ACM SIGPLAN International Workshop on Machine Learning and Programming Languages, pp. 10\u201319 (2019)","DOI":"10.1145\/3315508.3329973"},{"key":"37_CR30","doi-asserted-by":"crossref","unstructured":"Van\u00a0Loan, C.: Computational frameworks for the fast Fourier transform. SIAM (1992)","DOI":"10.1137\/1.9781611970999"},{"key":"37_CR31","unstructured":"Vasilache, N., et al.: Tensor comprehensions: Framework-agnostic high-performance machine learning abstractions. arXiv preprint arXiv:1802.04730 (2018)"},{"key":"37_CR32","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-07612-0_37","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T17:57:37Z","timestamp":1763920657000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-07612-0_37"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,24]]},"ISBN":["9783032076113","9783032076120"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-07612-0_37","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,24]]},"assertion":[{"value":"24 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISC High Performance","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on High Performance Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hamburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 June 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 June 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"40","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"supercomputing2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}