{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:16:35Z","timestamp":1750220195563,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,8]],"date-time":"2022-10-08T00:00:00Z","timestamp":1665187200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U20A20226"],"award-info":[{"award-number":["U20A20226"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,8]]},"DOI":"10.1145\/3559009.3569656","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T14:02:50Z","timestamp":1674828170000},"page":"451-466","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Parallelizing Neural Network Models Effectively on GPU by Implementing Reductions Atomically"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2303-9736","authenticated-orcid":false,"given":"Jie","family":"Zhao","sequence":"first","affiliation":[{"name":"State Key Laboratory of Mathematical Engineering and Advanced Computing, Zhengzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7164-8213","authenticated-orcid":false,"given":"C\u00e9dric","family":"Bastoul","sequence":"additional","affiliation":[{"name":"Huawei Technologies France SASU, Paris, France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3486-3731","authenticated-orcid":false,"given":"Yanzhi","family":"Yi","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4367-0464","authenticated-orcid":false,"given":"Jiahui","family":"Hu","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9903-8217","authenticated-orcid":false,"given":"Wang","family":"Nie","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9744-5676","authenticated-orcid":false,"given":"Renwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1031-6431","authenticated-orcid":false,"given":"Zhen","family":"Geng","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4160-7170","authenticated-orcid":false,"given":"Chong","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Technologies France SASU, Paris, France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3264-5535","authenticated-orcid":false,"given":"Thibaut","family":"Tachon","sequence":"additional","affiliation":[{"name":"Huawei Technologies France SASU, Paris, France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8983-4666","authenticated-orcid":false,"given":"Zhiliang","family":"Gan","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd., Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,1,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi , Paul Barham , Jianmin Chen , Zhifeng Chen , Andy Davis , Jeffrey Dean , Matthieu Devin , Sanjay Ghemawat , Geoffrey Irving , Michael Isard , Manjunath Kudlur , Josh Levenberg , Rajat Monga , Sherry Moore , Derek G. Murray , Benoit Steiner , Paul Tucker , Vijay Vasudevan , Pete Warden , Martin Wicke , Yuan Yu , and Xiaoqiang Zheng . 2016 . TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) . USENIX Association, Savannah, GA, 265--283. https:\/\/www.usenix.org\/conference\/osdi16\/technical-sessions\/presentation\/abadi Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). USENIX Association, Savannah, GA, 265--283. https:\/\/www.usenix.org\/conference\/osdi16\/technical-sessions\/presentation\/abadi"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/29873.29875"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2854038.2854048"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-12-385963-1.00026-5"},{"key":"e_1_3_2_1_6_1","unstructured":"Somashekaracharya G. Bhaskaracharya Julien Demouth and Vinod Grover. 2020. Automatic Kernel Generation for Volta Tensor Cores. arXiv:2006.12645 [cs.PL]  Somashekaracharya G. Bhaskaracharya Julien Demouth and Vinod Grover. 2020. Automatic Kernel Generation for Volta Tensor Cores. arXiv:2006.12645 [cs.PL]"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"e_1_3_2_1_8_1","volume-title":"Lin (Eds.)","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown , Benjamin Mann , Nick Ryder , Melanie Subbiah , Jared D Kaplan , Prafulla Dhariwal , Arvind Neelakantan , Pranav Shyam , Girish Sastry , Amanda Askell , Sandhini Agarwal , Ariel Herbert-Voss , Gretchen Krueger , Tom Henighan , Rewon Child , Aditya Ramesh , Daniel Ziegler , Jeffrey Wu , Clemens Winter , Chris Hesse , Mark Chen , Eric Sigler , Mateusz Litwin , Scott Gray , Benjamin Chess , Jack Clark , Christopher Berner , Sam McCandlish , Alec Radford , Ilya Sutskever , and Dario Amodei . 2020 . Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H . Lin (Eds.) , Vol. 33 . Curran Associates, Inc. , 1877--1901. https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 1877--1901. https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"e_1_3_2_1_9_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen , Thierry Moreau , Ziheng Jiang , Lianmin Zheng , Eddie Yan , Haichen Shen , Meghan Cowan , Leyuan Wang , Yuwei Hu , Luis Ceze , Carlos Guestrin , and Arvind Krishnamurthy . 2018 . TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18) . USENIX Association, Carlsbad, CA, 578--594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 578--594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2988450.2988454"},{"key":"e_1_3_2_1_11_1","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. arXiv:1410.0759 [cs.NE]  Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cuDNN: Efficient Primitives for Deep Learning. arXiv:1410.0759 [cs.NE]"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00083"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_2_1_14_1","volume-title":"5th International Workshop on Polyhedral Compilation Techniques","author":"Doerfert Johannes","year":"2015","unstructured":"Johannes Doerfert , Kevin Streit , Sebastian Hack , and Zino Benaissa . 2015 . Polly's Polyhedral Scheduling in the Presence of Reductions . In 5th International Workshop on Polyhedral Compilation Techniques ( Amsterdam, The Netherlands) (IMPACT 2015). 11 pages. Johannes Doerfert, Kevin Streit, Sebastian Hack, and Zino Benaissa. 2015. Polly's Polyhedral Scheduling in the Presence of Reductions. In 5th International Workshop on Polyhedral Compilation Techniques (Amsterdam, The Netherlands) (IMPACT 2015). 11 pages."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3211346.3211354"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF01407835"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-09766-4_502"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307681.3326606"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/1111037.1111041"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2017.7863746"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173182"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661187"},{"volume-title":"XLA: Optimizing Compiler for Machine Learning. https:\/\/www.tensorflow.org\/xla","year":"2017","key":"e_1_3_2_1_23_1","unstructured":"Google. 2017 . XLA: Optimizing Compiler for Machine Learning. https:\/\/www.tensorflow.org\/xla Google. 2017. XLA: Optimizing Compiler for Machine Learning. https:\/\/www.tensorflow.org\/xla"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2743016"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/3172077.3172127"},{"key":"e_1_3_2_1_27_1","volume-title":"Optimizing parallel reduction in CUDA. Nvidia developer technology 2, 4","author":"Harris Mark","year":"2007","unstructured":"Mark Harris . 2007. Optimizing parallel reduction in CUDA. Nvidia developer technology 2, 4 ( 2007 ), 1--39. Mark Harris. 2007. Optimizing parallel reduction in CUDA. Nvidia developer technology 2, 4 (2007), 1--39."},{"key":"e_1_3_2_1_28_1","volume-title":"Cooperative Groups: Flexible CUDA Thread Programming. https:\/\/developer.nvidia.com\/blog\/cooperative-groups","author":"Harris Mark","year":"2017","unstructured":"Mark Harris and Kyrylo Perelygin . 2017 . Cooperative Groups: Flexible CUDA Thread Programming. https:\/\/developer.nvidia.com\/blog\/cooperative-groups Mark Harris and Kyrylo Perelygin. 2017. Cooperative Groups: Flexible CUDA Thread Programming. https:\/\/developer.nvidia.com\/blog\/cooperative-groups"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2935323.2935326"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2502323.2502328"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"e_1_3_2_1_32_1","unstructured":"Huawei. 2020. MindSpore. https:\/\/www.mindspore.cn\/en  Huawei. 2020. MindSpore. https:\/\/www.mindspore.cn\/en"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/73560.73588"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/1362622.1362691"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3122948.3122952"},{"key":"e_1_3_2_1_37_1","unstructured":"Justin Luitjens. 2014. Faster Parallel Reductions on Kepler. https:\/\/developer.nvidia.com\/blog\/faster-parallel-reductions-kepler  Justin Luitjens. 2014. Faster Parallel Reductions on Kepler. https:\/\/developer.nvidia.com\/blog\/faster-parallel-reductions-kepler"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/233561.233564"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304047"},{"key":"e_1_3_2_1_40_1","unstructured":"Nvidia. 2013. cuBLAS. https:\/\/developer.nvidia.com\/cublas  Nvidia. 2013. cuBLAS. https:\/\/developer.nvidia.com\/cublas"},{"key":"e_1_3_2_1_41_1","unstructured":"Nvidia. 2018. CUB Documentation. https:\/\/nvlabs.github.io\/cub\/  Nvidia. 2018. CUB Documentation. https:\/\/nvlabs.github.io\/cub\/"},{"key":"e_1_3_2_1_42_1","unstructured":"Nvidia. 2020. CUDA C++ Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html  Nvidia. 2020. CUDA C++ Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/1508244.1508256"},{"key":"e_1_3_2_1_44_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. 8026--8037.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke , Sam Gross , Francisco Massa , Adam Lerer , James Bradbury , Gregory Chanan , Trevor Killeen , Zeming Lin , Natalia Gimelshein , Luca Antiga , Alban Desmaison , Andreas Kopf , Edward Yang , Zachary DeVito , Martin Raison , Alykhan Tejani , Sasank Chilamkurthy , Benoit Steiner , Lu Fang , Junjie Bai , and Soumith Chintala . 2019 . Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. 8026--8037. Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. 8026--8037."},{"key":"e_1_3_2_1_45_1","volume-title":"1987 16th International Conference on Parallel Processing (ICPP","author":"Polychronopoulos Constantine D","year":"1987","unstructured":"Constantine D Polychronopoulos . 1987 . Loop coalescing: A Compiler Transformation for Parallel machines . In 1987 16th International Conference on Parallel Processing (ICPP 1987). 235--242. Constantine D Polychronopoulos. 1987. Loop coalescing: A Compiler Transformation for Parallel machines. In 1987 16th International Conference on Parallel Processing (ICPP 1987). 235--242."},{"volume-title":"Polyhedral Optimization of TensorFlow Computation Graphs","author":"Pradelle Beno\u00eet","key":"e_1_3_2_1_46_1","unstructured":"Beno\u00eet Pradelle , Beno\u00eet Meister , Muthu Baskaran , Jonathan Springer , and Richard Lethin . 2019. Polyhedral Optimization of TensorFlow Computation Graphs . In Programming and Performance Visualization Tools, Abhinav Bhatele, David Boehme, Joshua A. Levine, Allen D. Malony, and Martin Schulz (Eds.). Springer International Publishing , Cham , 74--89. Beno\u00eet Pradelle, Beno\u00eet Meister, Muthu Baskaran, Jonathan Springer, and Richard Lethin. 2019. Polyhedral Optimization of TensorFlow Computation Graphs. In Programming and Performance Visualization Tools, Abhinav Bhatele, David Boehme, Joshua A. Levine, Allen D. Malony, and Martin Schulz (Eds.). Springer International Publishing, Cham, 74--89."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_48_1","volume-title":"Dilip Sequeira, Ashish Sirasao, Fei Sun, Hanlin Tang, Michael Thomson, Frank Wei, Ephrem Wu, Lingjie Xu, Koichi Yamada, Bing Yu, George Yuan, Aaron Zhong, Peizhao Zhang, and Yuchen Zhou.","author":"Reddi Vijay Janapa","year":"2020","unstructured":"Vijay Janapa Reddi , Christine Cheng , David Kanter , Peter Mattson , Guenther Schmuelling , Carole-Jean Wu , Brian Anderson , Maximilien Breughe , Mark Charlebois , William Chou , Ramesh Chukka , Cody Coleman , Sam Davis , Pan Deng , Greg Diamos , Jared Duke , Dave Fick , J. Scott Gardner , Itay Hubara , Sachin Idgunji , Thomas B. Jablin , Jeff Jiao , Tom St. John , Pankaj Kanwar , David Lee , Jeffery Liao , Anton Lokhmotov , Francisco Massa , Peng Meng , Paulius Micikevicius , Colin Osborne , Gennady Pekhimenko , Arun Tejusve Raghunath Rajan , Dilip Sequeira, Ashish Sirasao, Fei Sun, Hanlin Tang, Michael Thomson, Frank Wei, Ephrem Wu, Lingjie Xu, Koichi Yamada, Bing Yu, George Yuan, Aaron Zhong, Peizhao Zhang, and Yuchen Zhou. 2020 . MLPerf Inference Benchmark . arXiv:1911.02549 [cs.LG] Vijay Janapa Reddi, Christine Cheng, David Kanter, Peter Mattson, Guenther Schmuelling, Carole-Jean Wu, Brian Anderson, Maximilien Breughe, Mark Charlebois, William Chou, Ramesh Chukka, Cody Coleman, Sam Davis, Pan Deng, Greg Diamos, Jared Duke, Dave Fick, J. Scott Gardner, Itay Hubara, Sachin Idgunji, Thomas B. Jablin, Jeff Jiao, Tom St. John, Pankaj Kanwar, David Lee, Jeffery Liao, Anton Lokhmotov, Francisco Massa, Peng Meng, Paulius Micikevicius, Colin Osborne, Gennady Pekhimenko, Arun Tejusve Raghunath Rajan, Dilip Sequeira, Ashish Sirasao, Fei Sun, Hanlin Tang, Michael Thomson, Frank Wei, Ephrem Wu, Lingjie Xu, Koichi Yamada, Bing Yu, George Yuan, Aaron Zhong, Peizhao Zhang, and Yuchen Zhou. 2020. MLPerf Inference Benchmark. arXiv:1911.02549 [cs.LG]"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/2967938.2967950"},{"key":"e_1_3_2_1_50_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very Deep Convolutional Networks for Large-Scale Image Recognition. arXiv:1409.1556 [cs.CV]  Karen Simonyan and Andrew Zisserman. 2014. Very Deep Convolutional Networks for Large-Scale Image Recognition. arXiv:1409.1556 [cs.CV]"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2594291.2594342"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2017.7863734"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2017.7863747"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3355606"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2544137.2544141"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15582-6_49"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400713"},{"key":"e_1_3_2_1_59_1","volume-title":"Scheduling for PPCG. Report CW 706","author":"Verdoolaege Sven","year":"2017","unstructured":"Sven Verdoolaege and Gerda Janssens . 2017. Scheduling for PPCG. Report CW 706 ( 2017 ). Sven Verdoolaege and Gerda Janssens. 2017. Scheduling for PPCG. Report CW 706 (2017)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3434301"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-37658-0_2"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00044"},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of Machine Learning and Systems, Diana Marculescu, Yuejie Chi, and Carole-Jean Wu (Eds.)","volume":"4","author":"Zhao Jie","year":"2022","unstructured":"Jie Zhao , Xiong Gao , Ruijie Xia , Zhaochuang Zhang , Deshi Chen , Lei Chen , Renwei Zhang , Zhen Geng , Bin Cheng , and Xuefeng Jin . 2022 . Apollo: Automatic Partition-based Operator Fusion through Layer by Layer Optimization . In Proceedings of Machine Learning and Systems, Diana Marculescu, Yuejie Chi, and Carole-Jean Wu (Eds.) , Vol. 4 . 1--19. Jie Zhao, Xiong Gao, Ruijie Xia, Zhaochuang Zhang, Deshi Chen, Lei Chen, Renwei Zhang, Zhen Geng, Bin Cheng, and Xuefeng Jin. 2022. Apollo: Automatic Partition-based Operator Fusion through Layer by Layer Optimization. In Proceedings of Machine Learning and Systems, Diana Marculescu, Yuejie Chi, and Carole-Jean Wu (Eds.), Vol. 4. 1--19."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454106"},{"key":"e_1_3_2_1_65_1","volume-title":"Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng , Chengfan Jia , Minmin Sun , Zhao Wu , Cody Hao Yu , Ameer Haj-Ali , Yida Wang , Jun Yang , Danyang Zhuo , Koushik Sen , Joseph E. Gonzalez , and Ion Stoica . 2020 . Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20) . USENIX Association, 863--879. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/zheng Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 863--879. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/zheng"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/2259016.2259027"}],"event":{"name":"PACT '22: International Conference on Parallel Architectures and Compilation Techniques","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IFIP WG 10.3 IFIP WG 10.3","IEEE CS"],"location":"Chicago Illinois","acronym":"PACT '22"},"container-title":["Proceedings of the International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3559009.3569656","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3559009.3569656","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:38Z","timestamp":1750186958000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3559009.3569656"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,8]]},"references-count":64,"alternative-id":["10.1145\/3559009.3569656","10.1145\/3559009"],"URL":"https:\/\/doi.org\/10.1145\/3559009.3569656","relation":{},"subject":[],"published":{"date-parts":[[2022,10,8]]},"assertion":[{"value":"2023-01-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}