{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T08:00:14Z","timestamp":1776931214842,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767437","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:18:44Z","timestamp":1762532324000},"page":"793-803","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Architecting Tensor Core-Based Reductions for Irregular Molecular Docking Kernels"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6896-9879","authenticated-orcid":false,"given":"Leonardo","family":"Solis-Vasquez","sequence":"first","affiliation":[{"name":"Technical University of Darmstadt, Darmstadt, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1832-3030","authenticated-orcid":false,"given":"Andreas F.","family":"Tillack","sequence":"additional","affiliation":[{"name":"Scripps Research, La Jolla, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4622-3747","authenticated-orcid":false,"given":"Diogo","family":"Santos-Martins","sequence":"additional","affiliation":[{"name":"Scripps Research, La Jolla, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1164-3082","authenticated-orcid":false,"given":"Andreas","family":"Koch","sequence":"additional","affiliation":[{"name":"Technical University of Darmstadt, Darmstadt, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5964-7111","authenticated-orcid":false,"given":"Stefano","family":"Forli","sequence":"additional","affiliation":[{"name":"Scripps Research, La Jolla, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3331057"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00050"},{"key":"e_1_3_3_1_4_2","unstructured":"AMD. 2025. AI Engine: Meeting the Compute Demands of Next-Generation Applications. https:\/\/www.amd.com\/en\/products\/adaptive-socs-and-fpgas\/technologies\/ai-engine.html"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441599"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","unstructured":"C. Navarro R. Carrasco R.J. Barrientos J.A. Riquelme R. Vega. 2021. GPU Tensor Cores for Fast Arithmetic Reductions. IEEE Transactions on Parallel and Distributed Systems 32 1 (2021) 72 \u2013 84. 10.1109\/TPDS.2020.3011893","DOI":"10.1109\/TPDS.2020.3011893"},{"key":"e_1_3_3_1_7_2","unstructured":"Google Cloud. 2025. TPU architecture. https:\/\/cloud.google.com\/tpu\/docs\/system-architecture-tpu-vm"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","unstructured":"D. Santos-Martins L. Solis-Vasquez A.F. Tillack M.F. Sanner A. Koch S. Forli. 2021. Accelerating AutoDock4 with GPUs and Gradient-Based Local Search. J. Chem. Theory Comput. 17 2 (2021) 1060\u20131073. 10.1021\/acs.jctc.0c01006","DOI":"10.1021\/acs.jctc.0c01006"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-39698-441"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","unstructured":"G.M. Morris D.S. Goodsell R.S. Halliday R. Huey W.E. Hart R.K. Belew A.J. Olson. 1998. Automated docking using a Lamarckian genetic algorithm and an empirical binding free energy function. Journal of Computational Chemistry 19 14 (1998) 1639 \u2013 1662. 10.1002\/(SICI)1096-987X(19981115)19:14<1639::AID-JCC10>3.0.CO;2-B","DOI":"10.1002\/(SICI)1096-987X(19981115)19:14<1639::AID-JCC10>3.0.CO;2-B"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","unstructured":"H. Ootomo R. Yokota. 2022. Recovering single precision accuracy from Tensor Cores while surpassing the FP32 theoretical peak performance. The International Journal of High Performance Computing Applications 36 4 (2022) 475 \u2013 491. 10.1177\/10943420221090256","DOI":"10.1177\/10943420221090256"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"I. Halperin B. Ma H. Wolfson R. Nussinov. 2002. Principles of docking: An overview of search algorithms and a guide to scoring functions. Journal of Proteins: Structure Function and Bioinformatics 47 4 (2002) 409 \u2013 443. 10.1002\/prot.10115","DOI":"10.1002\/prot.10115"},{"key":"e_1_3_3_1_13_2","unstructured":"Intel. 2025. Intel Xe GPU Architecture. https:\/\/www.intel.com\/content\/www\/us\/en\/docs\/oneapi\/optimization-guide-gpu\/2025-0\/intel-xe-gpu-architecture.html"},{"key":"e_1_3_3_1_14_2","first-page":"1 \u2013 10","volume-title":"5th International Workshop on FPGAs for Software Programmers (FSP)","author":"Koch L. Solis-Vasquez, A.","year":"2018","unstructured":"L. Solis-Vasquez, A. Koch. 2018. A Case Study in Using OpenCL on FPGAs: Creating an Open-Source Accelerator of the AutoDock Molecular Docking Software. In 5th International Workshop on FPGAs for Software Programmers (FSP). VDE Verlag, 1 \u2013 10. https:\/\/ieeexplore.ieee.org\/document\/8470463"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","unstructured":"L. Solis-Vasquez A.F. Tillack D. Santos-Martins A. Koch S. LeGrand S. Forli. 2022. Benchmarking the performance of irregular computations in AutoDock-GPU molecular docking. Parallel Comput. 109 (2022) 102861. 10.1016\/j.parco.2021.102861","DOI":"10.1016\/j.parco.2021.102861"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/IA351965.2020.00008"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/IA354616.2021.00008"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3585341.3585372"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/P3HPC51967.2020.00009"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","unstructured":"M.D. Zeiler. 2012. ADADELTA: An Adaptive Learning Rate Method. arXiv abs\/1212.5701 (2012). 10.48550\/arXiv.1212.5701","DOI":"10.48550\/arXiv.1212.5701"},{"key":"e_1_3_3_1_21_2","unstructured":"NVIDIA. 2024. Warp Matrix Functions. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html#warp-matrix-functions"},{"key":"e_1_3_3_1_22_2","unstructured":"NVIDIA. 2025. Nsight Compute. https:\/\/docs.nvidia.com\/nsight-compute\/NsightCompute\/index.html"},{"key":"e_1_3_3_1_23_2","unstructured":"NVIDIA. 2025. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf"},{"key":"e_1_3_3_1_24_2","unstructured":"NVIDIA. 2025. NVIDIA Blackwell Architecture. https:\/\/www.nvidia.com\/en-us\/data-center\/technologies\/blackwell-architecture"},{"key":"e_1_3_3_1_25_2","unstructured":"NVIDIA. 2025. NVIDIA H100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-data-center-overview\/gtc22-whitepaper-hopper"},{"key":"e_1_3_3_1_26_2","unstructured":"NVIDIA. 2025. NVIDIA Tensor Cores - Unprecedented Acceleration for Generative AI. https:\/\/www.nvidia.com\/en-us\/data-center\/tensor-cores"},{"key":"e_1_3_3_1_27_2","unstructured":"H. Ootomo. 2024. wmma_extension - An extension library of WMMA API (Tensor Core API). https:\/\/github.com\/wmmae\/wmma_extension"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3388440.3412472"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"},{"key":"e_1_3_3_1_30_2","unstructured":"Scripps Research. 2025. AutoDock-GPU: AutoDock for GPUs and other accelerators. https:\/\/github.com\/ccsb-scripps\/AutoDock-GPU"},{"key":"e_1_3_3_1_31_2","unstructured":"Scripps Research. 2025. Set of 42 protein-ligand complexes for testing search algorithms and docking runtime. https:\/\/github.com\/diogomart\/AD-GPU_set_of_42"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00064"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","unstructured":"W. Sun A. Li T. Geng S. Stuijk H. Corporaal. 2023. Dissecting Tensor Cores via Microbenchmarks: Latency Throughput and Numeric Behaviors. IEEE Transactions on Parallel and Distributed Systems 34 1 (2023) 246 \u2013 261. 10.1109\/TPDS.2022.3217824","DOI":"10.1109\/TPDS.2022.3217824"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767437","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:31:10Z","timestamp":1767987070000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767437"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":32,"alternative-id":["10.1145\/3731599.3767437","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767437","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}