{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T18:14:44Z","timestamp":1771697684573,"version":"3.50.1"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2019,9,1]],"date-time":"2019-09-01T00:00:00Z","timestamp":1567296000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2019,9,1]],"date-time":"2019-09-01T00:00:00Z","timestamp":1567296000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2019,9,1]],"date-time":"2019-09-01T00:00:00Z","timestamp":1567296000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019,9]]},"DOI":"10.1109\/hpec.2019.8916466","type":"proceedings-article","created":{"date-parts":[[2019,11,29]],"date-time":"2019-11-29T12:11:36Z","timestamp":1575029496000},"page":"1-8","source":"Crossref","is-referenced-by-count":18,"title":["Low Overhead Instruction Latency Characterization for NVIDIA GPGPUs"],"prefix":"10.1109","author":[{"given":"Yehia","family":"Arafa","sequence":"first","affiliation":[]},{"given":"Abdel-Hameed A.","family":"Badawy","sequence":"additional","affiliation":[]},{"given":"Gopinath","family":"Chennupati","sequence":"additional","affiliation":[]},{"given":"Nandakishore","family":"Santhi","sequence":"additional","affiliation":[]},{"given":"Stephan","family":"Eidenbenz","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628092"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.3973"},{"key":"ref33","year":"2014","journal-title":"NVIDIA Visual Profiler User's Guide"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3168831"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750375"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2015.14"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2012.04.209"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/PMBS.2018.8641666"},{"key":"ref35","article-title":"Dissecting the NVIDIA volta GPU architecture via microbenchmarking","volume":"abs 1804 6826","author":"jia","year":"2018","journal-title":"CoRR"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2008.5214359"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2010.69"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/1806596.1806606"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2010.5452013"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2549523"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2010.59"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-014-1112-2"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2018.00027"},{"key":"ref16","author":"yunqing","year":"2011","journal-title":"Asfermi An Assembler for the NVIDIA Fermi Instruction Set"},{"key":"ref17","author":"gray","year":"2011","journal-title":"MaxAs Assembler for Nvidia Maxwell Architecture"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018755"},{"key":"ref19","author":"van der laan","year":"2008","journal-title":"Decuda"},{"key":"ref28","article-title":"Optimizing stencil computations for nvidia kepler gpus","author":"maruyama","year":"2014","journal-title":"Proc Int Workshop High-Performance Stencil Comput"},{"key":"ref4","year":"2012","journal-title":"Kepler GPU Architecture"},{"key":"ref27","first-page":"1","article-title":"Performance upper bound analysis and optimization of sgemm on fermi and kepler gpus","author":"lai","year":"2013","journal-title":"Proceedings of the 2013 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)"},{"key":"ref3","year":"2009","journal-title":"NVIDIA Fermi GPU Architecture"},{"key":"ref6","year":"2016","journal-title":"Pascal GPU Architecture"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3240302.3270315"},{"key":"ref5","year":"2014","journal-title":"Maxwell GPU Architecture"},{"key":"ref8","year":"2018","journal-title":"NVIDIA Turing GPU Architecture"},{"key":"ref7","year":"2017","journal-title":"NVIDIA Volta GPU Architecture"},{"key":"ref2","year":"2008","journal-title":"Nvidia tesla v100 gpu architecture"},{"key":"ref9","author":"cheng","year":"2014","journal-title":"Professional CUDA C Programming"},{"key":"ref1","year":"0","journal-title":"Top500"},{"key":"ref46","article-title":"Mixed precision training","volume":"abs 1710 3740","author":"micikevicius","year":"2017","journal-title":"CoRR"},{"key":"ref20","year":"2019","journal-title":"The CUDA Compiler Driver NVCC"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2017.8091072"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"ref47","article-title":"Holistic management of the GPGPU memory hierarchy to manage warp-level latency tolerance","volume":"abs 1804 11038","author":"ausavarungnirun","year":"2018","journal-title":"CoRR"},{"key":"ref21","year":"2019","journal-title":"PTX Parallel Thread Execution ISA Version 1 4"},{"key":"ref42","year":"2018","journal-title":"Cuda toolkit documentation v9 1"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541948"},{"key":"ref41","year":"2019","journal-title":"CUDA Binary Utilities"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2019.2904497"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/MDAT.2016.2630270"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/2063384.2063431"},{"key":"ref43","year":"2018","journal-title":"CUDA Toolkit Documentation v10 0"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178536"}],"event":{"name":"2019 IEEE High Performance Extreme Computing Conference (HPEC)","location":"Waltham, MA, USA","start":{"date-parts":[[2019,9,24]]},"end":{"date-parts":[[2019,9,26]]}},"container-title":["2019 IEEE High Performance Extreme Computing Conference (HPEC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8910148\/8916214\/08916466.pdf?arnumber=8916466","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T14:47:03Z","timestamp":1658155623000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8916466\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,9]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/hpec.2019.8916466","relation":{},"subject":[],"published":{"date-parts":[[2019,9]]}}}