{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,21]],"date-time":"2025-06-21T11:28:32Z","timestamp":1750505312277,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":20,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,23]],"date-time":"2024-06-23T00:00:00Z","timestamp":1719100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,23]]},"DOI":"10.1145\/3649329.3655683","type":"proceedings-article","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:27:22Z","timestamp":1731007642000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["GSPO: A Graph Substitution and Parallelization Joint Optimization Framework for DNN Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5264-2818","authenticated-orcid":false,"given":"Zheng","family":"Xu","sequence":"first","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4702-0635","authenticated-orcid":false,"given":"Xu","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5117-7920","authenticated-orcid":false,"given":"Shaojun","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2309-572X","authenticated-orcid":false,"given":"Shouyi","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, BNRist, Tsinghua University, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6942-4395","authenticated-orcid":false,"given":"Yang","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Tsinghua University, Beijing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,11,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation (OSDI)","author":"Abadi M.","year":"2016","unstructured":"M. Abadi, P. Barham, J. Chen, Z. Chen, and et al. Davis, A. 2016. TensorFlow: A System for Large-Scale Machine Learning. In Proceedings of the 12th USENIX Conference on Operating Systems Design and Implementation (OSDI) (2016)."},{"key":"e_1_3_2_1_2_1","volume-title":"Neural Machine Translation by Jointly Learning to Align and Translate. CoRR","author":"Bahdanau Dzmitry","year":"2014","unstructured":"Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2014. Neural Machine Translation by Jointly Learning to Align and Translate. CoRR (2014)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2004.60"},{"key":"e_1_3_2_1_4_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation","author":"Jiang","year":"2018","unstructured":"T.; Jiang Z.; Zheng L. Yan E. Shen H. Cowan M. Wang L. et al. Chen, T.; Moreau. 2018. TVM: An automated end-to-end optimizing compiler for deep learning. 13th USENIX Symposium on Operating Systems Design and Implementation (2018)."},{"key":"e_1_3_2_1_5_1","unstructured":"CUDA. 2022. CUDA C++ Programming Guide. https:\/\/docs.nvidia.com\/cuda."},{"key":"e_1_3_2_1_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. CoRR","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. CoRR (2018)."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of Machine Learning and Systems","author":"Ding Yaoyao","year":"2021","unstructured":"Yaoyao Ding, Ligeng Zhu, Zhihao Jia, Gennady Pekhimenko, and Song Han. 2021. Ios: Inter-operator scheduler for cnn acceleration. Proceedings of Machine Learning and Systems (2021)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407857"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_10_1","volume-title":"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and < 0.5 MB model size. arXiv preprint arXiv:1602.07360","author":"Iandola Forrest N","year":"2016","unstructured":"Forrest N Iandola, Song Han, Matthew W Moskewicz, Khalid Ashraf, William J Dally, and Kurt Keutzer. 2016. SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and < 0.5 MB model size. arXiv preprint arXiv:1602.07360 (2016)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of Machine Learning and Systems.","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia, James Thomas, Todd Warszawski, Mingyu Gao, Matei Zaharia, and Alex Aiken. 2019. Optimizing DNN Computation with Relaxed Graph Substitutions. In Proceedings of Machine Learning and Systems."},{"key":"e_1_3_2_1_13_1","volume-title":"ImageNet Classification with Deep Convolutional Neural Networks. Advances in Neural Information Processing Systems","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. Advances in Neural Information Processing Systems (2012)."},{"key":"e_1_3_2_1_14_1","volume-title":"Nimble: Lightweight and Parallel GPU Task Scheduling for Deep Learning. In Advances in Neural Information Processing Systems.","author":"Kwon Woosuk","year":"2020","unstructured":"Woosuk Kwon, Gyeong-In Yu, Eunji Jeong, and Byung-Gon Chun. 2020. Nimble: Lightweight and Parallel GPU Task Scheduling for Deep Learning. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_15_1","unstructured":"PyTorch. 2022. PyTorch CUDA Semantics. https:\/\/pytorch.org."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_17_1","volume-title":"\u0141 ukasz Kaiser, and Illia Polosukhin","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_18_1","volume-title":"AutoGraph: Optimizing DNN Computation Graph for Parallel GPU Kernel Execution","author":"Zhao Yuxuan","year":"2023","unstructured":"Yuxuan Zhao, Qi Sun, Zhuolun He, Yang Bai, and Bei Yu. 2023. AutoGraph: Optimizing DNN Computation Graph for Parallel GPU Kernel Execution. Association for the Advancement of Artificial Intelligence (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Neural architecture search with reinforcement learning. arXiv preprint arXiv:1611.01578","author":"Zoph Barret","year":"2016","unstructured":"Barret Zoph and Quoc V Le. 2016. Neural architecture search with reinforcement learning. arXiv preprint arXiv:1611.01578 (2016)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00907"}],"event":{"name":"DAC '24: 61st ACM\/IEEE Design Automation Conference","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE-CEDA","SIGBED ACM Special Interest Group on Embedded Systems"],"location":"San Francisco CA USA","acronym":"DAC '24"},"container-title":["Proceedings of the 61st ACM\/IEEE Design Automation Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3655683","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649329.3655683","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:48Z","timestamp":1750295868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3655683"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,23]]},"references-count":20,"alternative-id":["10.1145\/3649329.3655683","10.1145\/3649329"],"URL":"https:\/\/doi.org\/10.1145\/3649329.3655683","relation":{},"subject":[],"published":{"date-parts":[[2024,6,23]]},"assertion":[{"value":"2024-11-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}