{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:30:59Z","timestamp":1773246659155,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":20,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,8,24]],"date-time":"2020-08-24T00:00:00Z","timestamp":1598227200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,8,24]]},"DOI":"10.1145\/3409963.3410493","type":"proceedings-article","created":{"date-parts":[[2020,8,19]],"date-time":"2020-08-19T18:03:42Z","timestamp":1597860222000},"page":"75-81","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["Profiling and optimizing deep learning inference on mobile GPUs"],"prefix":"10.1145","author":[{"given":"Shiqi","family":"Jiang","sequence":"first","affiliation":[{"name":"Microsoft Research"}]},{"given":"Lihao","family":"Ran","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"given":"Yusen","family":"Xu","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"given":"Yunxin","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]}],"member":"320","published-online":{"date-parts":[[2020,8,24]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2019. Q2 2019 Smartphone and Tablet GPU Market Share. https:\/\/www.businesswire.com\/news\/home\/20191118005549\/en\/.  2019. Q2 2019 Smartphone and Tablet GPU Market Share. https:\/\/www.businesswire.com\/news\/home\/20191118005549\/en\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2019. The Standard for Embedded Accelerated 3D Graphics. https:\/\/www.khronos.org\/opengles\/.  2019. The Standard for Embedded Accelerated 3D Graphics. https:\/\/www.khronos.org\/opengles\/."},{"key":"e_1_3_2_1_3_1","unstructured":"2020. RenderScript Overview. https:\/\/developer.android.com\/.  2020. RenderScript Overview. https:\/\/developer.android.com\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2020. Snapdragon Profiler. https:\/\/developer.qualcomm.com\/software\/snapdragon-profiler.  2020. Snapdragon Profiler. https:\/\/developer.qualcomm.com\/software\/snapdragon-profiler."},{"key":"e_1_3_2_1_5_1","unstructured":"2020. Streamline Performance Analyzer. https:\/\/developer.arm.com\/tools-and-software\/embedded\/arm-development-studio.  2020. Streamline Performance Analyzer. https:\/\/developer.arm.com\/tools-and-software\/embedded\/arm-development-studio."},{"key":"e_1_3_2_1_6_1","unstructured":"2020. TensorFlow Lite ML for Mobile and Edge Devices. https:\/\/www.tensorflow.org\/lite.  2020. TensorFlow Lite ML for Mobile and Edge Devices. https:\/\/www.tensorflow.org\/lite."},{"key":"e_1_3_2_1_7_1","unstructured":"2020. The Open Standard for Parallel Programming of Heterogeneous Systems. https:\/\/www.khronos.org\/opengcl\/.  2020. The Open Standard for Parallel Programming of Heterogeneous Systems. https:\/\/www.khronos.org\/opengcl\/."},{"key":"e_1_3_2_1_8_1","unstructured":"Chris Cummins Pavlos Petoumenos Michel Steuwer and Hugh Leather. 2016. Autotuning OpenCL Workgroup Size for Stencil Patterns. In &lt;u&gt;The 6th International Workshop on Adaptive Self-tuning Computing Systems (ADAPT)&lt;\/u&gt;.  Chris Cummins Pavlos Petoumenos Michel Steuwer and Hugh Leather. 2016. Autotuning OpenCL Workgroup Size for Stencil Patterns. In &lt;u&gt;The 6th International Workshop on Adaptive Self-tuning Computing Systems (ADAPT)&lt;\/u&gt;."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"T. L. Falch and A. C. Elster. 2015. Machine Learning Based Auto-Tuning for Enhanced OpenCL Performance Portability. In &lt;u&gt;2015 IEEE International Parallel and Distributed Processing Symposium Workshop&lt;\/u&gt;.  T. L. Falch and A. C. Elster. 2015. Machine Learning Based Auto-Tuning for Enhanced OpenCL Performance Portability. In &lt;u&gt;2015 IEEE International Parallel and Distributed Processing Symposium Workshop&lt;\/u&gt;.","DOI":"10.1109\/IPDPSW.2015.85"},{"key":"e_1_3_2_1_10_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In &lt;u&gt;Proceedings of the IEEE conference on computer vision and pattern recognition&lt;\/u&gt;. 770--778.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In &lt;u&gt;Proceedings of the IEEE conference on computer vision and pattern recognition&lt;\/u&gt;. 770--778."},{"key":"e_1_3_2_1_11_1","volume-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications. &lt;u&gt;arXiv preprint arXiv:1704.04861&lt;\/u&gt","author":"Howard Andrew G","year":"2017"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Gao Huang Zhuang Liu Laurens van der Maaten and Kilian Q. Weinberger. 2017. Densely Connected Convolutional Networks. &lt;u&gt;2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)&lt;\/u&gt; (2017).  Gao Huang Zhuang Liu Laurens van der Maaten and Kilian Q. Weinberger. 2017. Densely Connected Convolutional Networks. &lt;u&gt;2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)&lt;\/u&gt; (2017).","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Gang Huang Mengwei Xu Felix Xiaozhu Lin Yunxin Liu Yun Ma Saumay Pushp and Xuanzhe Liu. 2017. ShuffleDog: Characterizing and Adapting User-Perceived Latency of Android Apps. &lt;u&gt;IEEE Transactions on Mobile Computing&lt;\/u&gt; 16 10 (2017) 2913--2926.  Gang Huang Mengwei Xu Felix Xiaozhu Lin Yunxin Liu Yun Ma Saumay Pushp and Xuanzhe Liu. 2017. ShuffleDog: Characterizing and Adapting User-Perceived Latency of Android Apps. &lt;u&gt;IEEE Transactions on Mobile Computing&lt;\/u&gt; 16 10 (2017) 2913--2926.","DOI":"10.1109\/TMC.2017.2651823"},{"key":"e_1_3_2_1_14_1","unstructured":"Zhe Jia Marco Maggioni Benjamin Staiger and Daniele Paolo Scarpazza. 2018. Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking. &lt;u&gt;CoRR&lt;\/u&gt; abs\/1804.06826 (2018).  Zhe Jia Marco Maggioni Benjamin Staiger and Daniele Paolo Scarpazza. 2018. Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking. &lt;u&gt;CoRR&lt;\/u&gt; abs\/1804.06826 (2018)."},{"key":"e_1_3_2_1_15_1","unstructured":"Juhyun Lee Nikolay Chirkov Ekaterina Ignasheva Yury Pisarchyk Mogan Shieh Fabio Riccardi Raman Sarokin Andrei Kulik and Matthias Grundmann. 2019. On-Device Neural Net Inference with Mobile GPUs. (2019). arXiv:1907.01989 http:\/\/arxiv.org\/abs\/1907.01989  Juhyun Lee Nikolay Chirkov Ekaterina Ignasheva Yury Pisarchyk Mogan Shieh Fabio Riccardi Raman Sarokin Andrei Kulik and Matthias Grundmann. 2019. On-Device Neural Net Inference with Mobile GPUs. (2019). arXiv:1907.01989 http:\/\/arxiv.org\/abs\/1907.01989"},{"key":"e_1_3_2_1_16_1","unstructured":"Arm Limited. 2019. &lt;u&gt;Arm Mali GPU OpenCL Developer Guide&lt;\/u&gt;. Technical Report.  Arm Limited. 2019. &lt;u&gt;Arm Mali GPU OpenCL Developer Guide&lt;\/u&gt;. Technical Report."},{"key":"e_1_3_2_1_17_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. &lt;u&gt;arXiv preprint arXiv:1409.1556&lt;\/u&gt; (2014).  Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. &lt;u&gt;arXiv preprint arXiv:1409.1556&lt;\/u&gt; (2014)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Christian Szegedy Vincent Vanhoucke Sergey Ioffe Jon Shlens and Zbigniew Wojna. 2016. Rethinking the Inception Architecture for Computer Vision. &lt;u&gt;2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)&lt;\/u&gt; (2016).  Christian Szegedy Vincent Vanhoucke Sergey Ioffe Jon Shlens and Zbigniew Wojna. 2016. Rethinking the Inception Architecture for Computer Vision. &lt;u&gt;2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)&lt;\/u&gt; (2016).","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_19_1","unstructured":"Qualcomm Technologies. 2017. &lt;u&gt;Qualcomm Snapdragon Mobile Platform OpenCL General Programming and Optimization&lt;\/u&gt;. Technical Report.  Qualcomm Technologies. 2017. &lt;u&gt;Qualcomm Snapdragon Mobile Platform OpenCL General Programming and Optimization&lt;\/u&gt;. Technical Report."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Barret Zoph Vijay Vasudevan Jonathon Shlens and Quoc V. Le. 2018. Learning Transferable Architectures for Scalable Image Recognition. &lt;u&gt;2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition&lt;\/u&gt; (2018).  Barret Zoph Vijay Vasudevan Jonathon Shlens and Quoc V. Le. 2018. Learning Transferable Architectures for Scalable Image Recognition. &lt;u&gt;2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition&lt;\/u&gt; (2018).","DOI":"10.1109\/CVPR.2018.00907"}],"event":{"name":"APSys '20: 11th ACM SIGOPS Asia-Pacific Workshop on Systems","location":"Tsukuba Japan","acronym":"APSys '20","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 11th ACM SIGOPS Asia-Pacific Workshop on Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3409963.3410493","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3409963.3410493","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:17:08Z","timestamp":1750191428000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3409963.3410493"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,8,24]]},"references-count":20,"alternative-id":["10.1145\/3409963.3410493","10.1145\/3409963"],"URL":"https:\/\/doi.org\/10.1145\/3409963.3410493","relation":{},"subject":[],"published":{"date-parts":[[2020,8,24]]},"assertion":[{"value":"2020-08-24","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}