{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,19]],"date-time":"2026-06-19T22:42:28Z","timestamp":1781908948383,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":113,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,2,22]],"date-time":"2022-02-22T00:00:00Z","timestamp":1645488000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,2,28]]},"DOI":"10.1145\/3503222.3507767","type":"proceedings-article","created":{"date-parts":[[2022,2,22]],"date-time":"2022-02-22T20:49:01Z","timestamp":1645562941000},"page":"27-42","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":46,"title":["A full-stack search technique for domain optimized deep learning accelerators"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5112-1839","authenticated-orcid":false,"given":"Dan","family":"Zhang","sequence":"first","affiliation":[{"name":"Google Brain, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Safeen","family":"Huda","sequence":"additional","affiliation":[{"name":"Google, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7450-1594","authenticated-orcid":false,"given":"Ebrahim","family":"Songhori","sequence":"additional","affiliation":[{"name":"Google Brain, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kartik","family":"Prabhu","sequence":"additional","affiliation":[{"name":"Stanford University, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Quoc","family":"Le","sequence":"additional","affiliation":[{"name":"Google Brain, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4887-6293","authenticated-orcid":false,"given":"Anna","family":"Goldie","sequence":"additional","affiliation":[{"name":"Google Brain, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Azalia","family":"Mirhoseini","sequence":"additional","affiliation":[{"name":"Google Brain, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2022,2,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Electric Power Monthly with Data for","year":"2021","unstructured":"2020. Electric Power Monthly with Data for May 2021. https:\/\/www.eia.gov\/electricity\/monthly\/current_month\/july2021.pdf Accessed: 2021-08-09"},{"key":"e_1_3_2_1_2_1","unstructured":"2021. Developer Guide - NVIDIA Deep Learning cuDNN. https:\/\/web.archive.org\/web\/20210520075036\/https:\/\/docs.nvidia.com\/deeplearning\/cudnn\/developer-guide\/index.html##op-fusion"},{"key":"e_1_3_2_1_3_1","unstructured":"2021. Software Engineer Salaries in San Francisco Bay Area. https:\/\/www.levels.fyi\/Salaries\/Software-Engineer\/San-Francisco-Bay-Area\/ Accessed: 2021-08-09"},{"key":"e_1_3_2_1_4_1","volume-title":"Lane","author":"Abdelfattah Mohamed S.","year":"2020","unstructured":"Mohamed S. Abdelfattah, Lukasz Dudziak, Thomas Chau, Royson Lee, Hyeji Kim, and Nicholas D. Lane. 2020. Best of Both Worlds: AutoML Codesign of a CNN and Its Hardware Accelerator. In Proceedings of the 57th ACM\/EDAC\/IEEE Design Automation Conference (DAC \u201920). IEEE Press, Article 192, 6 pages. isbn:9781450367257 https:\/\/dl.acm.org\/doi\/abs\/10.5555\/3437539.3437731"},{"key":"e_1_3_2_1_5_1","volume-title":"Workshop on ML for Systems at NeurIPS.","author":"Abdolrashidi Amirali","year":"2019","unstructured":"Amirali Abdolrashidi, Qiumin Xu, Shibo Wang, Sudip Roy, and Yanqi Zhou. 2019. Learning to Fuse. Workshop on ML for Systems at NeurIPS."},{"key":"e_1_3_2_1_6_1","unstructured":"Daniel Adiwardana Minh-Thang Luong David R So Jamie Hall Noah Fiedel Romal Thoppilan Zi Yang Apoorv Kulshreshtha Gaurav Nemade and Yifeng Lu. 2020. Towards a human-like open-domain chatbot. arXiv preprint arXiv:2001.09977."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783725"},{"key":"e_1_3_2_1_8_1","volume-title":"Emil Talpes, and Bill McGee.","author":"Bannon Pete","year":"2021","unstructured":"Pete Bannon, Ganesh Venkataramanan, Debjit Das Sarma, Emil Talpes, and Bill McGee. 2021. Compute and Redundancy Solution for the Full Self-Driving Computer. https:\/\/web.archive.org\/web\/20210413053454\/https:\/\/old.hotchips.org\/hc31\/HC31_2.3_Tesla_Hotchips_ppt_Final_0817.pdf"},{"key":"e_1_3_2_1_9_1","volume-title":"The Datacenter as a Computer: Designing Warehouse-Scale Machines","author":"Barroso Luiz Andr\u00e9","unstructured":"Luiz Andr\u00e9 Barroso, Urs H\u00f6lzle, and Parthasarathy Ranganathan. 2018. The Datacenter as a Computer: Designing Warehouse-Scale Machines (3rd ed.). Morgan & Claypool Publishers. isbn:9781681734330","edition":"3"},{"key":"e_1_3_2_1_10_1","volume-title":"Artificial-intelligence hardware: New opportunities for semiconductor companies","author":"Batra Gaurav","unstructured":"Gaurav Batra, Zach Jacobson, Siddarth Madhav, Andrea Queirolo, and Nick Santhanam. 2018. Artificial-intelligence hardware: New opportunities for semiconductor companies. McKinsey & Company, New York, NY, USA, Tech. Rep."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.14778\/3229863.3229865"},{"key":"e_1_3_2_1_12_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arxiv:2005.14165."},{"key":"e_1_3_2_1_13_1","unstructured":"Han Cai Ligeng Zhu and Song Han. 2019. ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware. arxiv:1812.00332."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.40"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.022071131"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054275"},{"key":"e_1_3_2_1_19_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:1810.04805.","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:1810.04805."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750389"},{"key":"e_1_3_2_1_21_1","unstructured":"William Fedus Barret Zoph and Noam Shazeer. 2021. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. arXiv preprint arXiv:2101.03961."},{"key":"e_1_3_2_1_22_1","volume-title":"Understanding return on investment","author":"Friedlob George T","unstructured":"George T Friedlob and Franklin J Plewa Jr. 1996. Understanding return on investment. John Wiley & Sons."},{"key":"e_1_3_2_1_23_1","unstructured":"Gerald Gamrath Daniel Anderson Ksenia Bestuzheva Wei-Kun Chen Leon Eifler Maxime Gasse Patrick Gemander Ambros Gleixner Leona Gottwald Katrin Halbig Gregor Hendel Christopher Hojny Thorsten Koch Pierre Le Bodic Stephen J. Maher Frederic Matter Matthias Miltenberger Erik M\u00fchmer Benjamin M\u00fcller Marc E. Pfetsch Franziska Schl\u00f6sser Felipe Serrano Yuji Shinano Christine Tawfik Stefan Vigerske Fabian Wegscheider Dieter Weninger and Jakob Witzig. 2020. The SCIP Optimization Suite 7.0. Optimization Online. http:\/\/www.optimization-online.org\/DB_HTML\/2020\/03\/7705.html"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence (UAI\u201914)","author":"Gelbart Michael A.","year":"2075","unstructured":"Michael A. Gelbart, Jasper Snoek, and Ryan P. Adams. 2014. Bayesian Optimization with Unknown Constraints. In Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence (UAI\u201914). AUAI Press, Arlington, Virginia, USA. 250\u2013259. isbn:9780974903910 https:\/\/dl.acm.org\/doi\/10.5555\/3020751.3020778"},{"key":"e_1_3_2_1_25_1","volume-title":"10th NIPS Workshop on Optimization for Machine Learning.","author":"Golovin Daniel","year":"2017","unstructured":"Daniel Golovin, Greg Kochanski, and John Elliot Karro. 2017. Black box optimization via a bayesian-optimized genetic algorithm. In 10th NIPS Workshop on Optimization for Machine Learning."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098043"},{"key":"e_1_3_2_1_27_1","volume-title":"XLA: Optimizing Compiler for TensorFlow. https:\/\/www.tensorflow.org\/xla","year":"2018","unstructured":"Google. 2018. XLA: Optimizing Compiler for TensorFlow. https:\/\/www.tensorflow.org\/xla"},{"key":"e_1_3_2_1_28_1","unstructured":"Suyog Gupta and Mingxing Tan. 2019. EfficientNet-EdgeTPU: Creating Accelerator-Optimized Neural Networks with AutoML. https:\/\/ai.googleblog.com\/2019\/08\/efficientnet-edgetpu-creating.html"},{"key":"e_1_3_2_1_29_1","volume-title":"Trained Quantization and Huffman Coding. In 4th International Conference on Learning Representations. arxiv:1510","author":"Han Song","unstructured":"Song Han, Huizi Mao, and William J. Dally. 2016. Deep Compression: Compressing Deep Neural Network with Pruning, Trained Quantization and Huffman Coding. In 4th International Conference on Learning Representations. arxiv:1510.00149"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317829"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446762"},{"key":"e_1_3_2_1_34_1","volume-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861.","author":"Howard Andrew G","year":"2017","unstructured":"Andrew G Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, and Hartwig Adam. 2017. Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861."},{"key":"e_1_3_2_1_35_1","volume-title":"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and&lt","author":"Iandola Forrest N","unstructured":"Forrest N Iandola, Song Han, Matthew W Moskewicz, Khalid Ashraf, William J Dally, and Kurt Keutzer. 2016. SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and&lt; 0.5 MB model size. arXiv preprint arXiv:1602.07360."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2020.2986127"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3360307"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_40_1","unstructured":"David Kanter and Vijay Janapa Reddi. 2021. MLPerf Inference Rules. https:\/\/github.com\/mlcommons\/inference_policies\/blob\/master\/inference_rules.adoc"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00058"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3400302.3415639"},{"key":"e_1_3_2_1_43_1","volume-title":"ML for Systems Workshop at NeurIPS, Article arXiv:2008","author":"Kaufman Samuel J.","year":"2020","unstructured":"Samuel J. Kaufman, Phitchaya Mangpo Phothilimthana, Yanqi Zhou, and Mike Burrows. 2020. A Learned Performance Model for the Tensor Processing Unit. ML for Systems Workshop at NeurIPS, Article arXiv:2008.01040, Aug., arXiv:2008.01040 pages."},{"key":"e_1_3_2_1_44_1","unstructured":"Brucek Khailany. 2019. Machine-Learning-Assisted Agile VLSI Design For Machine Learning. https:\/\/web.archive.org\/web\/20210810054054\/http:\/\/crva.ict.ac.cn\/documents\/agile-and-open-hardware\/khailany-sigarch-visioning-oahw2019.pdf"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS.1988.14929"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2006.884574"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2985963"},{"key":"e_1_3_2_1_48_1","volume-title":"8th International Conference on Learning Representations, ICLR 2020","author":"Lan Zhenzhong","year":"2020","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2020. Albert: A lite bert for self-supervised learning of language representations. In 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26-30, 2020. OpenReview.net. https:\/\/openreview.net\/forum?id=H1eA7AEtvS"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.435"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2016.11"},{"key":"e_1_3_2_1_51_1","volume-title":"Ninth International Conference on Learning Representations (ICLR). arxiv:2006","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2021. Gshard: Scaling giant models with conditional computation and automatic sharding. In Ninth International Conference on Learning Representations (ICLR). arxiv:2006.16668"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446759"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00799"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218749"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3400302.3415645"},{"key":"e_1_3_2_1_56_1","volume-title":"Neural-Hardware Architecture Search. In NeurIPS ML for Systems Workshop.","author":"Lin Yujun","year":"2019","unstructured":"Yujun Lin, Driss Hafdi, Kuan Wang, Zhijian Liu, and Song Han. 2019. Neural-Hardware Architecture Search. In NeurIPS ML for Systems Workshop."},{"key":"e_1_3_2_1_57_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692.","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692."},{"key":"e_1_3_2_1_58_1","unstructured":"Guoping Long Jun Yang Kai Zhu and Wei Lin. 2018. FusionStitching: Deep fusion and code generation for tensorflow computations on gpus. arXiv preprint arXiv:1811.05213."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2936215"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3020078.3021736"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.25"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2974843"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3059962"},{"key":"e_1_3_2_1_65_1","unstructured":"Maxim Milakov and Natalia Gimelshein. 2018. Online normalizer calculation for softmax. arxiv:1805.02867."},{"key":"e_1_3_2_1_66_1","unstructured":"Pandu Nayak. 2019. Understanding searches better than ever before. https:\/\/blog.google\/products\/search\/search-language-understanding-bert\/"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/NORCHIP.2014.7004740"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"e_1_3_2_1_69_1","unstructured":"Jongsoo Park Maxim Naumov Protonu Basu Summer Deng Aravind Kalaiah Daya Shanker Khudia James Law Parth Malani Andrey Malevich Nadathur Satish Juan Miguel Pino Martin Schatz Alexander Sidorov Viswanath Sivakumar Andrew Tulloch Xiaodong Wang Yiming Wu Hector Yuen Utku Diril Dmytro Dzhulgakov Kim M. Hazelwood Bill Jia Yangqing Jia Lin Qiao Vijay Rao Nadav Rotem Sungjoo Yoo and Mikhail Smelyanskiy. 2018. Deep learning inference in facebook data centers: Characterization performance optimizations and hardware implications. CoRR abs\/1811.09886 (2018)."},{"key":"e_1_3_2_1_70_1","unstructured":"David Patterson Joseph Gonzalez Quoc Le Chen Liang Lluis-Miquel Munguia Daniel Rothchild David So Maud Texier and Jeff Dean. 2021. Carbon emissions and large neural network training. arXiv preprint arXiv:2104.10350."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.032271058"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3060403.3066860"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2015.42"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00480"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/2847263.2847265"},{"key":"e_1_3_2_1_76_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog, 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. 2019. Language models are unsupervised multitask learners. OpenAI blog, 1, 8 (2019), 9."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","unstructured":"Parthasarathy Ranganathan Daniel Stodolsky Jeff Calow Jeremy Dorfman Marisabel Guevara Clinton Wills Smullen IV Aki Kuusela Raghu Balasubramanian Sandeep Bhatia Prakash Chauhan Anna Cheung In Suk Chong Niranjani Dasharathi Jia Feng Brian Fosco Samuel Foss Ben Gelb Sara J. Gwin Yoshiaki Hase Da-ke He C. Richard Ho Roy W. Huffman Jr. Elisha Indupalli Indira Jayaram Poonacha Kongetira Cho Mon Kyaw Aaron Laursen Yuan Li Fong Lou Kyle A. Lucke JP Maaninen Ramon Macias Maire Mahony David Alexander Munday Srikanth Muroor Narayana Penukonda Eric Perkins-Argueta Devin Persaud Alex Ramirez Ville-Mikko Rautio Yolanda Ripley Amir Salek Sathish Sekar Sergey N. Sokolov Rob Springer Don Stark Mercedes Tan Mark S. Wachsler Andrew C. Walton David A. Wickeraad Alvin Wijaya and Hon Kwan Wu. 2021. Warehouse-scale video acceleration: co-design and deployment in the wild. In Proceedings of the 26th ACM International Conference on Architectural Support for Programming Languages and Operating Systems. 600\u2013615. https:\/\/doi.org\/10.1145\/3445814.3446723 10.1145\/3445814.3446723","DOI":"10.1145\/3445814.3446723"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISLPED.2017.8009208"},{"key":"e_1_3_2_1_80_1","volume-title":"Relay: A High-Level Compiler for Deep Learning. arxiv:1904.08368.","author":"Roesch Jared","year":"2019","unstructured":"Jared Roesch, Steven Lyubomirsky, Marisa Kirisame, Logan Weber, Josh Pollock, Luis Vega, Ziheng Jiang, Tianqi Chen, Thierry Moreau, and Zachary Tatlock. 2019. Relay: A High-Level Compiler for Deep Learning. arxiv:1904.08368."},{"key":"e_1_3_2_1_81_1","volume-title":"CUDA by example: an introduction to general-purpose GPU programming","author":"Sanders Jason","unstructured":"Jason Sanders and Edward Kandrot. 2010. CUDA by example: an introduction to general-purpose GPU programming. Addison-Wesley Professional."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_83_1","unstructured":"Zachary Shahan. 2020. Tesla Autopilot Innovation Comes From Team Of 300 Jedi Engineers \u2014 Interview With Elon Musk. https:\/\/web.archive.org\/web\/20210430195722\/https:\/\/cleantechnica.com\/2020\/08\/15\/tesla-autopilot-innovation-comes-from-team-of-300-jedi-engineers-interview-with-elon-musk\/"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358302"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783720"},{"key":"e_1_3_2_1_86_1","unstructured":"Zhan Shi Chirag Sakhuja Milad Hashemi Kevin Swersky and Calvin Lin. 2020. Learned Hardware\/Software Co-Design of Neural Accelerators. arxiv:2010.02075."},{"key":"e_1_3_2_1_87_1","unstructured":"Ryan Smith. 2020. NVIDIA Ampere Unleashed: NVIDIA Announces New GPU Architecture A100 GPU and Accelerator. https:\/\/www.anandtech.com\/show\/15801\/nvidia-announces-ampere-architecture-and-a100-products Accessed: 2021-08-09"},{"key":"e_1_3_2_1_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/2847263.2847276"},{"key":"e_1_3_2_1_89_1","unstructured":"Supermicro. 2021. Data Centers & the Environment on the state of the green datacenter. https:\/\/www.supermicro.com\/en\/white-paper\/datacenter-report"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00293"},{"key":"e_1_3_2_1_91_1","volume-title":"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. In International Conference on Machine Learning. 6105\u20136114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. In International Conference on Machine Learning. 6105\u20136114."},{"key":"e_1_3_2_1_92_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998\u20136008."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD45719.2019.8942127"},{"key":"e_1_3_2_1_94_1","volume-title":"Zitian Liu, and Scott Shenker.","author":"Vulimiri Ashish","year":"2013","unstructured":"Ashish Vulimiri, P Godfrey, Sri Varsha Gorge, Zitian Liu, and Scott Shenker. 2013. A cost-benefit analysis of low latency via added utilization. arXiv preprint arXiv:1306.3534."},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317875"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.1145\/3061639.3062207"},{"key":"e_1_3_2_1_97_1","volume-title":"CosTLO: Cost-Effective Redundancy for Lower Latency Variance on Cloud Storage Services. In 12th USENIX Symposium on Networked Systems Design and Implementation (NSDI 15)","author":"Wu Zhe","year":"1971","unstructured":"Zhe Wu, Curtis Yu, and Harsha V. Madhyastha. 2015. CosTLO: Cost-Effective Redundancy for Lower Latency Variance on Cloud Storage Services. In 12th USENIX Symposium on Networked Systems Design and Implementation (NSDI 15). USENIX Association, Oakland, CA. 543\u2013557. isbn:978-1-931971-218 https:\/\/www.usenix.org\/conference\/nsdi15\/technical-sessions\/presentation\/wu"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1145\/216585.216588"},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"publisher","DOI":"10.1145\/3273982.3273991"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"publisher","DOI":"10.5555\/3437539.3437702"},{"key":"e_1_3_2_1_101_1","volume-title":"Analyze DNN Accelerators. In 25th International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS).","author":"Yang Xuan","year":"2020","unstructured":"Xuan Yang, Mingyu Gao, Qiaoyi Liu, Jeff Setter, Jing Pu, Ankita Nayak, Steven Bell, Kaidi Cao, Heonjae Ha, Priyanka Raina, Christos Kozyrakis, and Mark Horowitz. 2020. Interstellar: Using Halide\u2019s Scheduling Language to Analyze DNN Accelerators. In 25th International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)."},{"key":"e_1_3_2_1_102_1","volume-title":"Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems, 32","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Russ R Salakhutdinov, and Quoc V Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems, 32 (2019)."},{"key":"e_1_3_2_1_103_1","volume-title":"Apollo: Transferable Architecture Exploration. arxiv:2102.01723.","author":"Yazdanbakhsh Amir","year":"2021","unstructured":"Amir Yazdanbakhsh, Christof Angermueller, Berkin Akin, Yanqi Zhou, Albin Jones, Milad Hashemi, Kevin Swersky, Satrajit Chatterjee, Ravi Narayanaswami, and James Laudon. 2021. Apollo: Transferable Architecture Exploration. arxiv:2102.01723."},{"key":"e_1_3_2_1_104_1","unstructured":"Amir Yazdanbakhsh Kiran Seshadri Berkin Akin James Laudon and Ravi Narayanaswami. 2021. An Evaluation of Edge TPU Accelerators for Convolutional Neural Networks. arxiv:2102.10423."},{"key":"e_1_3_2_1_105_1","doi-asserted-by":"publisher","DOI":"10.1145\/2684746.2689060"},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2017.2785257"},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"publisher","DOI":"10.1145\/3296957.3173197"},{"key":"e_1_3_2_1_108_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI.2019.00014"},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240765.3240801"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"publisher","DOI":"10.1145\/3400302.3415609"},{"key":"e_1_3_2_1_111_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00044"},{"key":"e_1_3_2_1_112_1","unstructured":"Yanqi Zhou Xuanyi Dong Berkin Akin Mingxing Tan Daiyi Peng Tianjian Meng Amir Yazdanbakhsh Da Huang Ravi Narayanaswami and James Laudon. 2021. Rethinking Co-design of Neural Architectures and Hardware Accelerators. arxiv:2102.08619."},{"key":"e_1_3_2_1_113_1","first-page":"13844","article-title":"Transferable Graph Optimizers for ML Compilers","volume":"33","author":"Zhou Yanqi","year":"2020","unstructured":"Yanqi Zhou, Sudip Roy, Amirali Abdolrashidi, Daniel Wong, Peter Ma, Qiumin Xu, Hanxiao Liu, Phitchaya Phothilimtha, Shen Wang, Anna Goldie, Azalia Mirhoseini, and James Laudon. 2020. Transferable Graph Optimizers for ML Compilers. In Advances in Neural Information Processing Systems. 33, 13844\u201313855.","journal-title":"Advances in Neural Information Processing Systems."}],"event":{"name":"ASPLOS '22: 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Lausanne Switzerland","acronym":"ASPLOS '22","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503222.3507767","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503222.3507767","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:30:49Z","timestamp":1750188649000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503222.3507767"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,2,22]]},"references-count":113,"alternative-id":["10.1145\/3503222.3507767","10.1145\/3503222"],"URL":"https:\/\/doi.org\/10.1145\/3503222.3507767","relation":{},"subject":[],"published":{"date-parts":[[2022,2,22]]},"assertion":[{"value":"2022-02-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}