{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T08:42:01Z","timestamp":1780994521558,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,12]],"date-time":"2019-10-12T00:00:00Z","timestamp":1570838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,12]]},"DOI":"10.1145\/3352460.3358307","type":"proceedings-article","created":{"date-parts":[[2019,10,11]],"date-time":"2019-10-11T11:16:45Z","timestamp":1570792605000},"page":"372-383","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":155,"title":["NVBit"],"prefix":"10.1145","author":[{"given":"Oreste","family":"Villa","sequence":"first","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mark","family":"Stephenson","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"David","family":"Nellans","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Stephen W.","family":"Keckler","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2019,10,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Fourth Annual Workshop on Modeling, Benchmarking and Simulation (MoBS).","author":"Aamer Jaleel","year":"2008","unstructured":"Jaleel Aamer, S. Cohn Robert, Luk Chi-Keung, and Jacob Bruce. 2008. CMP$im: A Pin-Based On-The-Fly Multi-Core Cache Simulator. In Fourth Annual Workshop on Modeling, Benchmarking and Simulation (MoBS)."},{"key":"e_1_3_2_1_2_1","unstructured":"Derek Bruening. 2004. Efficient Transparent and Comprehensive Runtime Code Manipulation. Ph.D. Dissertation. Massachusetts Institute of Technology."},{"key":"e_1_3_2_1_3_1","volume-title":"An Analysis of Deep Neural Network Models for Practical Applications. CoRR abs\/1605.07678","author":"Canziani Alfredo","year":"2016","unstructured":"Alfredo Canziani, Adam Paszke, and Eugenio Culurciello. 2016. An Analysis of Deep Neural Network Models for Practical Applications. CoRR abs\/1605.07678 (2016). arXiv:1605.07678"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the Conference on Neural Information Processing Systems (NeurIPS).","author":"Collobert Ronan","year":"2011","unstructured":"Ronan Collobert, Koray Kavukcuoglu, and Cl\u00e9ment Farabet. 2011. Torch7: A Matlab-like Environment for Machine Learning. In Proceedings of the Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_5_1","volume-title":"CUDA Programming: A Developer's Guide to Parallel Computing with GPUs. Morgan Kaufmann","author":"Cook Shane","unstructured":"Shane Cook. 2012. CUDA Programming: A Developer's Guide to Parallel Computing with GPUs. Morgan Kaufmann, Waltham, MA."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2597917.2597943"},{"key":"e_1_3_2_1_7_1","unstructured":"Gregory Diamos Andrew Kerr and Mukil Kesavan. 2009. Translating GPU Binaries to Tiered SIMD Architectures with Ocelot. Technical Report 09-01. Georgia Institute of Technology. http:\/\/www.cercs.gatech.edu\/tech-reports\/tr2009\/abstracts\/01.html"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1964179.1964192"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the International Symposium on Performance Analysis of Systems and Software (ISPASS). 249--258","author":"Sastry Hari Siva Kumar","year":"2017","unstructured":"Siva Kumar Sastry Hari, Timothy Tsai, Mark Stephenson, Stephen W. Keckler, and Joel Emer. 2017. SASSIFI: An Architecture-level Fault Injection Tool for GPU Application Resilience Evaluation. In Proceedings of the International Symposium on Performance Analysis of Systems and Software (ISPASS). 249--258."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Conference on Architectural Support for Programming Languages and Operation Systems (ASPLOS). 156--164","author":"Hauswirth Matthias","unstructured":"Matthias Hauswirth and Trishul M. Chilimbi. 2004. Low-overhead Memory Leak Detection Using Adaptive Statistical Profiling. In Proceedings of the International Conference on Architectural Support for Programming Languages and Operation Systems (ASPLOS). 156--164."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the International Symposium on Code Generation and Optimization (CGO). 229--241","author":"Hayes Ari B.","unstructured":"Ari B. Hayes, Fei Hua, Jin Huang, Yan-Hao Chen, and Eddy Z. Zhang. 2019. Decoding CUDA Binary. In Proceedings of the International Symposium on Code Generation and Optimization (CGO). 229--241."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/1176760.1176793"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/4434.895108"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735696"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the International Conference on Supercomputing (ICS). 369--378","author":"Lam Michael O.","unstructured":"Michael O. Lam, Jeffrey K. Hollingsworth, Bronis R. de Supinski, and Matthew P. LeGendre. 2013. Automatically Adapting Programs for Mixed-precision Floating-point Computation. In Proceedings of the International Conference on Supercomputing (ICS). 369--378."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.29"},{"key":"e_1_3_2_1_18_1","unstructured":"Linux Programmer's Manual. http:\/\/man7.org\/linux\/man-pages\/man8\/ld.so.8.html. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/1065010.1065034"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/3140065.3140094"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/40.216747"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/1250734.1250746"},{"key":"e_1_3_2_1_23_1","unstructured":"NVIDIA CUDA Binary Utilities. https:\/\/docs.nvidia.com\/cuda\/cuda-binary-utilities\/index.html. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_24_1","unstructured":"NVIDIA CUDA Compiler Driver NVCC. https:\/\/docs.nvidia.com\/cuda\/cuda-compiler-driver-nvcc\/index.html. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_25_1","unstructured":"NVIDIA CUDA Driver APIs. https:\/\/docs.nvidia.com\/cuda\/cuda-driver-api\/index.html. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_26_1","unstructured":"NVIDIA CUDA Fortran. https:\/\/developer.nvidia.com\/cuda-fortran. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_27_1","unstructured":"NVIDIA CUDA GDB. https:\/\/docs.nvidia.com\/cuda\/cuda-gdb\/index.html. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_28_1","unstructured":"NVIDIA CUPTI Callback APIs. https:\/\/docs.nvidia.com\/cuda\/cupti\/group__CUPTI__CALLBACK__API.html. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_29_1","unstructured":"NVIDIA GPU Accelerated Libraries for Computing. https:\/\/developer.nvidia.com\/gpu-accelerated-libraries. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_30_1","unstructured":"NVIDIA Parallel Thread Execution ISA. https:\/\/docs.nvidia.com\/cuda\/parallel-thread-execution\/index.html. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_31_1","unstructured":"NVIDIA TITAN V. https:\/\/www.nvidia.com\/en-us\/titan\/titan-v\/. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_32_1","unstructured":"SASSI Instrumentation Tool for NVIDIA GPUs. https:\/\/github.com\/NVlabs\/SASSI. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the Workshop on Computer Architecture Education.","author":"Reddi Vijay Janapa","unstructured":"Vijay Janapa Reddi, Alex Settle, Daniel A. Connors, and Robert S. Cohn. 2004. PIN: A Binary Instrumentation Tool for Computer Architecture Research and Education. In Proceedings of the Workshop on Computer Architecture Education."},{"key":"e_1_3_2_1_34_1","unstructured":"Standard Performance Evaluation Corporation (SPEC): ACCEL. https:\/\/www.spec.org\/accel\/. Accessed: 2019-02-11."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/178243.178260"},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the International Symposium on Computer Architecture (ISCA). 185--197","author":"Stephenson Mark","unstructured":"Mark Stephenson, Siva Kumar Sastry Hari, Yunsup Lee, Eiman Ebrahimi, Daniel R. Johnson, David Nellans, Mike O'Connor, and Stephen W. Keckler. 2015. Flexible Software Profiling of GPU Architectures. In Proceedings of the International Symposium on Computer Architecture (ISCA). 185--197."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2010.69"},{"key":"e_1_3_2_1_38_1","volume-title":"Fast Convolutional Nets with fbfft: A GPU Performance Evaluation. CoRR abs\/1412.7580","author":"Vasilache Nicolas","year":"2014","unstructured":"Nicolas Vasilache, Jeff Johnson, Michael Mathieu, Soumith Chintala, Serkan Piantino, and Yann LeCun. 2014. Fast Convolutional Nets with fbfft: A GPU Performance Evaluation. CoRR abs\/1412.7580 (2014)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-32820-6_85"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-78791-4_10"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/1133981.1134012"}],"event":{"name":"MICRO '52: The 52nd Annual IEEE\/ACM International Symposium on Microarchitecture","location":"Columbus OH USA","acronym":"MICRO '52","sponsor":["SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing","IEEE CS"]},"container-title":["Proceedings of the 52nd Annual IEEE\/ACM International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3352460.3358307","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3352460.3358307","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,29]],"date-time":"2025-07-29T22:30:01Z","timestamp":1753828201000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3352460.3358307"}},"subtitle":["A Dynamic Binary Instrumentation Framework for NVIDIA GPUs"],"short-title":[],"issued":{"date-parts":[[2019,10,12]]},"references-count":41,"alternative-id":["10.1145\/3352460.3358307","10.1145\/3352460"],"URL":"https:\/\/doi.org\/10.1145\/3352460.3358307","relation":{},"subject":[],"published":{"date-parts":[[2019,10,12]]},"assertion":[{"value":"2019-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}