{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T12:18:20Z","timestamp":1771935500734,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,3,25]],"date-time":"2019-03-25T00:00:00Z","timestamp":1553472000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1750760"],"award-info":[{"award-number":["1750760"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,3,25]]},"DOI":"10.1145\/3302424.3303949","type":"proceedings-article","created":{"date-parts":[[2019,3,22]],"date-time":"2019-03-22T13:10:03Z","timestamp":1553260203000},"page":"1-16","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":35,"title":["GRNN"],"prefix":"10.1145","author":[{"given":"Connor","family":"Holmes","sequence":"first","affiliation":[{"name":"Department of Computer Science, Colorado School of Mines, Golden, Colorado"}]},{"given":"Daniel","family":"Mawhirter","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Colorado School of Mines, Golden, Colorado"}]},{"given":"Yuxiong","family":"He","sequence":"additional","affiliation":[{"name":"Microsoft Business AI and Research, Seattle, Washington"}]},{"given":"Feng","family":"Yan","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Nevada, Reno, Nevada"}]},{"given":"Bo","family":"Wu","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Colorado School of Mines, Golden, Colorado"}]}],"member":"320","published-online":{"date-parts":[[2019,3,25]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Dense linear algebra on gpus. https:\/\/developer.nvidia.com\/cublas. Accessed: 2018-10-1.  Dense linear algebra on gpus. https:\/\/developer.nvidia.com\/cublas. Accessed: 2018-10-1."},{"key":"e_1_3_2_1_2_1","unstructured":"NVIDIA CUDA. http:\/\/www.nvidia.com\/cuda.  NVIDIA CUDA. http:\/\/www.nvidia.com\/cuda."},{"key":"e_1_3_2_1_3_1","unstructured":"The accelerated linear algebra compiler framework. https:\/\/www.tensorflow.org\/performance\/xla\/ 2018.  The accelerated linear algebra compiler framework. https:\/\/www.tensorflow.org\/performance\/xla\/ 2018."},{"key":"e_1_3_2_1_4_1","unstructured":"Nv-wavenet: Better speech synthesis using gpu-enabled wavenet inference. https:\/\/devblogs.nvidia.com\/nv-wavenet-gpu-speech-synthesis\/ 2018.  Nv-wavenet: Better speech synthesis using gpu-enabled wavenet inference. https:\/\/devblogs.nvidia.com\/nv-wavenet-gpu-speech-synthesis\/ 2018."},{"key":"e_1_3_2_1_5_1","unstructured":"Nvidia tensort - programmable inference accelerator. https:\/\/developer.nvidia.com\/tensorrt 2018.  Nvidia tensort - programmable inference accelerator. https:\/\/developer.nvidia.com\/tensorrt 2018."},{"key":"e_1_3_2_1_6_1","first-page":"265","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation, OSDI'16","author":"Abadi Mart\u00edn","year":"2016"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872362.2872368"},{"key":"e_1_3_2_1_8_1","unstructured":"Tianqi Chen Mu Li Yutian Li Min Lin Naiyan Wang Minjie Wang Tianjun Xiao Bing Xu Chiyuan Zhang and Zheng Zhang. Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. CoRR abs\/1512.01274 2015.  Tianqi Chen Mu Li Yutian Li Min Lin Naiyan Wang Minjie Wang Tianjun Xiao Bing Xu Chiyuan Zhang and Zheng Zhang. Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems. CoRR abs\/1512.01274 2015."},{"key":"e_1_3_2_1_9_1","unstructured":"Tianqi Chen Thierry Moreau Ziheng Jiang Haichen Shen Eddie Q. Yan Leyuan Wang Yuwei Hu Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. TVM: end-to-end optimization stack for deep learning. CoRR abs\/1802.04799 2018.  Tianqi Chen Thierry Moreau Ziheng Jiang Haichen Shen Eddie Q. Yan Leyuan Wang Yuwei Hu Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. TVM: end-to-end optimization stack for deep learning. CoRR abs\/1802.04799 2018."},{"key":"e_1_3_2_1_10_1","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. cudnn: Efficient primitives for deep learning. CoRR abs\/1410.0759 2014.  Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. cudnn: Efficient primitives for deep learning. CoRR abs\/1410.0759 2014."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"e_1_3_2_1_12_1","volume-title":"Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015","author":"Chorowski Jan","year":"2015"},{"key":"e_1_3_2_1_13_1","first-page":"613","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017"},{"key":"e_1_3_2_1_14_1","first-page":"2024","volume-title":"Proceedings of the 33nd International Conference on Machine Learning, ICML 2016","author":"Diamos Greg","year":"2016"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190541"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"e_1_3_2_1_17_1","volume-title":"Innovative Parallel Computing, page","author":"Gupta Kshitij","year":"2012"},{"key":"e_1_3_2_1_18_1","unstructured":"Song Han Huizi Mao and William J. Dally. Deep compression: Compressing deep neural network with pruning trained quantization and huffman coding. CoRR abs\/1510.00149 2015.  Song Han Huizi Mao and William J. Dally. Deep compression: Compressing deep neural network with pruning trained quantization and huffman coding. CoRR abs\/1510.00149 2015."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"e_1_3_2_1_20_1","volume-title":"Harcourt Brace & Co.","author":"Hecht-Nielsen Robert","year":"1992"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/3016100.3016285"},{"key":"e_1_3_2_1_24_1","first-page":"2849","volume-title":"Proceedings of the 33nd International Conference on Machine Learning, ICML 2016","author":"Lin Darryl Dexu","year":"2016"},{"key":"e_1_3_2_1_25_1","first-page":"2873","volume-title":"Proceedings of the Twenty-Fifth International Joint Conference on Artificial Intelligence, IJCAI 2016","author":"Liu Pengfei","year":"2016"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"e_1_3_2_1_27_1","unstructured":"Christopher Olston Noah Fiedel Kiril Gorovoy Jeremiah Harmsen Li Lao Fangwei Li Vinu Rajashekhar Sukriti Ramesh and Jordan Soyke. Tensorflow-serving: Flexible high-performance ML serving. CoRR abs\/1712.06139 2017.  Christopher Olston Noah Fiedel Kiril Gorovoy Jeremiah Harmsen Li Lao Fangwei Li Vinu Rajashekhar Sukriti Ramesh and Jordan Soyke. Tensorflow-serving: Flexible high-performance ML serving. CoRR abs\/1712.06139 2017."},{"key":"e_1_3_2_1_28_1","volume-title":"NIPS-W","author":"Paszke Adam","year":"2017"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2945397"},{"key":"e_1_3_2_1_30_1","unstructured":"Jeff A. Stuart and John D. Owens. Efficient synchronization primitives for gpus. CoRR abs\/1110.4623 2011.  Jeff A. Stuart and John D. Owens. Efficient synchronization primitives for gpus. CoRR abs\/1110.4623 2011."},{"key":"e_1_3_2_1_31_1","unstructured":"Ming Tan Bing Xiang and Bowen Zhou. Lstm-based deep learning models for non-factoid answer selection. CoRR abs\/1511.04108 2015.  Ming Tan Bing Xiang and Bowen Zhou. Lstm-based deep learning models for non-factoid answer selection. CoRR abs\/1511.04108 2015."},{"key":"e_1_3_2_1_32_1","unstructured":"Theano Development Team. Theano: A Python framework for fast computation of mathematical expressions. arXiv e-prints abs\/1605.02688 May 2016.  Theano Development Team. Theano: A Python framework for fast computation of mathematical expressions. arXiv e-prints abs\/1605.02688 May 2016."},{"key":"e_1_3_2_1_33_1","volume-title":"Deep Learning and Unsupervised Feature Learning Workshop, NIPS 2011","author":"Vanhoucke Vincent","year":"2011"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751213"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037742"},{"key":"e_1_3_2_1_36_1","first-page":"1","volume-title":"24th IEEE International Symposium on Parallel and Distributed Processing, IPDPS 2010, Atlanta, Georgia, USA, 19-23 April 2010 - Conference Proceedings","author":"Xiao Shucai","year":"2010"},{"key":"e_1_3_2_1_37_1","unstructured":"Caiming Xiong Victor Zhong and Richard Socher. Dynamic coattention networks for question answering. CoRR abs\/1611.01604 2016.  Caiming Xiong Victor Zhong and Richard Socher. Dynamic coattention networks for question answering. CoRR abs\/1611.01604 2016."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1174"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080215"},{"key":"e_1_3_2_1_40_1","first-page":"951","volume-title":"2018 USENIX Annual Technical Conference (USENIX ATC 18)","author":"Zhang Minjia","year":"2018"},{"key":"e_1_3_2_1_41_1","volume-title":"Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015","author":"Zhang Xiang","year":"2015"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123978"}],"event":{"name":"EuroSys '19: Fourteenth EuroSys Conference 2019","location":"Dresden Germany","acronym":"EuroSys '19","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Fourteenth EuroSys Conference 2019"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3302424.3303949","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3302424.3303949","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3302424.3303949","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T01:01:48Z","timestamp":1750208508000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3302424.3303949"}},"subtitle":["Low-Latency and Scalable RNN Inference on GPUs"],"short-title":[],"issued":{"date-parts":[[2019,3,25]]},"references-count":42,"alternative-id":["10.1145\/3302424.3303949","10.1145\/3302424"],"URL":"https:\/\/doi.org\/10.1145\/3302424.3303949","relation":{},"subject":[],"published":{"date-parts":[[2019,3,25]]},"assertion":[{"value":"2019-03-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}