{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:40:06Z","timestamp":1755877206274,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T00:00:00Z","timestamp":1740700800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"CCF-Ant Research Fund CCF-AFSGRF","award":["20230207"],"award-info":[{"award-number":["20230207"]}]},{"DOI":"10.13039\/501100006374","name":"Beijing Nova Program","doi-asserted-by":"publisher","award":["Z211100002121143, 20220484217"],"award-info":[{"award-number":["Z211100002121143, 20220484217"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Innovation Funding of ICT, CAS","award":["E461030"],"award-info":[{"award-number":["E461030"]}]},{"name":"Youth Innovation Promotion Association of Chinese Academy of Sciences","award":["2021099"],"award-info":[{"award-number":["2021099"]}]},{"DOI":"10.13039\/501100006374","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFB0300202"],"award-info":[{"award-number":["2021YFB0300202"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102396, 62032023, T2125013"],"award-info":[{"award-number":["62102396, 62032023, T2125013"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Pilotfor Major Scientific Research Facility of Jiangsu Province of China","award":["BM2021800"],"award-info":[{"award-number":["BM2021800"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,28]]},"DOI":"10.1145\/3710848.3710857","type":"proceedings-article","created":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T06:20:57Z","timestamp":1740723657000},"page":"563-565","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Magneto: Accelerating Parallel Structures in DNNs via Co-Optimization of Operators"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2716-5051","authenticated-orcid":false,"given":"Zhanyuan","family":"Di","sequence":"first","affiliation":[{"name":"SKLP, Institute of Computing Technology, CAS, University of Chinese, Academy of Sciences"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4940-5598","authenticated-orcid":false,"given":"Leping","family":"Wang","sequence":"additional","affiliation":[{"name":"SKLP, Institute of Computing Technology, CAS"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5821-1163","authenticated-orcid":false,"given":"Ziyi","family":"Ren","sequence":"additional","affiliation":[{"name":"SKLP, Institute of Computing Technology, CAS, University of Chinese, Academy of Sciences"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9678-7228","authenticated-orcid":false,"given":"En","family":"Shao","sequence":"additional","affiliation":[{"name":"SKLP, Institute of Computing Technology, CAS, University of Chinese, Academy of Sciences"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2303-9736","authenticated-orcid":false,"given":"Jie","family":"Zhao","sequence":"additional","affiliation":[{"name":"Hunan University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4682-983X","authenticated-orcid":false,"given":"Siyuan","family":"Feng","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5422-4497","authenticated-orcid":false,"given":"Dingwen","family":"Tao","sequence":"additional","affiliation":[{"name":"SKLP, Institute of Computing Technology, CAS"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6361-5948","authenticated-orcid":false,"given":"Guangming","family":"Tan","sequence":"additional","affiliation":[{"name":"SKLP, Institute of Computing Technology, CAS"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1953-1392","authenticated-orcid":false,"given":"Ninghui","family":"Sun","sequence":"additional","affiliation":[{"name":"SKLP, Institute of Computing Technology, CAS"}]}],"member":"320","published-online":{"date-parts":[[2025,2,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Retrieved","author":"AMD.","year":"2024","unstructured":"AMD. 2024. MIGraphX Documentation. Retrieved November 20, 2024 from https:\/\/rocm.docs.amd.com\/projects\/AMDMIGraphX v2.4."},{"key":"e_1_3_2_1_2_1","volume-title":"Retrieved","author":"AMD.","year":"2024","unstructured":"AMD. 2024. ROCm Documentation. Retrieved November 20, 2024 from https:\/\/rocm.docs.amd.com v5.4.3."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of The 33rd International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"182","author":"Amodei Dario","year":"2016","unstructured":"Dario Amodei, Sundaram Ananthanarayanan, Rishita Anubhai, Jingliang Bai, Eric Battenberg, Carl Case, Jared Casper, Bryan Catanzaro, Qiang Cheng, Guoliang Chen, Jie Chen, Jingdong Chen, Zhijie Chen, Mike Chrzanowski, Adam Coates, Greg Diamos, Ke Ding, Niandong Du, Erich Elsen, Jesse Engel, Weiwei Fang, Linxi Fan, Christopher Fougner, Liang Gao, Caixia Gong, Awni Hannun, Tony Han, Lappi Johannes, Bing Jiang, Cai Ju, Billy Jun, Patrick LeGresley, Libby Lin, Junjie Liu, Yang Liu, Weigao Li, Xiangang Li, Dongpeng Ma, Sharan Narang, Andrew Ng, Sherjil Ozair, Yiping Peng, Ryan Prenger, Sheng Qian, Zongfeng Quan, Jonathan Raiman, Vinay Rao, Sanjeev Satheesh, David Seetapun, Shubho Sengupta, Kavya Srinet, Anuroop Sriram, Haiyuan Tang, Liliang Tang, Chong Wang, Jidong Wang, Kaifu Wang, Yi Wang, Zhijian Wang, Zhiqian Wang, Shuang Wu, Likai Wei, Bo Xiao, Wen Xie, Yan Xie, Dani Yogatama, Bin Yuan, Jun Zhan, and Zhenyao Zhu. 2016. Deep Speech 2: End-to-End Speech Recognition in English and Mandarin. In Proceedings of The 33rd International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 48), Maria Florina Balcan and Kilian Q. Weinberger (Eds.). PMLR, New York, New York, USA, 173--182. https:\/\/proceedings.mlr. press\/v48\/amodei16.html"},{"key":"e_1_3_2_1_4_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, Carlsbad, CA, 578--594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen"},{"key":"e_1_3_2_1_5_1","volume-title":"Retrieved","author":"ONNX","year":"2024","unstructured":"ONNX developers. 2024. ONNX. Retrieved November 20, 2024 from https:\/\/onnx.ai v1.13."},{"key":"e_1_3_2_1_6_1","volume-title":"Retrieved","author":"ONNX","year":"2024","unstructured":"ONNX Runtime developers. 2024. ONNX Runtime. Retrieved November 20, 2024 from https:\/\/github.com\/microsoft\/onnxruntime v1.14.0."},{"key":"e_1_3_2_1_7_1","volume-title":"Retrieved","author":"PyTorch","year":"2024","unstructured":"PyTorch developers. 2024. PyTorch. Retrieved November 20, 2024 from https:\/\/pytorch.org v1.12."},{"key":"e_1_3_2_1_8_1","volume-title":"Retrieved","author":"TensorFlow","year":"2024","unstructured":"TensorFlow developers. 2024. Tensorflow. Retrieved November 20, 2024 from https:\/\/www.tensorflow.org\/ v2.12."},{"key":"e_1_3_2_1_9_1","volume-title":"Retrieved","author":"XLA","year":"2024","unstructured":"XLA developers. 2024. XLA. Retrieved November 20, 2024 from https:\/\/www.tensorflow.org\/xla v2.12."},{"key":"e_1_3_2_1_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_11_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research 23, 120 (2022), 1--39.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_12_1","volume-title":"Yolox: Exceeding yolo series in","author":"Ge Zheng","year":"2021","unstructured":"Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun. 2021. Yolox: Exceeding yolo series in 2021. arXiv preprint arXiv:2107.08430 (2021)."},{"key":"e_1_3_2_1_13_1","volume-title":"LSTM: A search space odyssey","author":"Greff Klaus","year":"2016","unstructured":"Klaus Greff, Rupesh K Srivastava, Jan Koutn\u00edk, Bas R Steunebrink, and J\u00fcrgen Schmidhuber. 2016. LSTM: A search space odyssey. IEEE transactions on neural networks and learning systems 28, 10 (2016), 2222--2232."},{"key":"e_1_3_2_1_14_1","volume-title":"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and &lt;0.5 MB model size. arXiv preprint arXiv:1602.07360","author":"Iandola Forrest N","year":"2016","unstructured":"Forrest N Iandola, Song Han, Matthew W Moskewicz, Khalid Ashraf, William J Dally, and Kurt Keutzer. 2016. SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and &lt;0.5 MB model size. arXiv preprint arXiv:1602.07360 (2016)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO53902.2022.9741270"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220007"},{"key":"e_1_3_2_1_17_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. Rammer: Enabling Holistic Deep Learning Compiler Optimizations with {rTasks}. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 881--897."},{"key":"e_1_3_2_1_18_1","volume-title":"Retrieved","author":"Ma Lingxiao","year":"2024","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2024. NNFusion. Retrieved November 20, 2024 from https:\/\/github.com\/microsoft\/nnfusion v0.3."},{"key":"e_1_3_2_1_19_1","volume-title":"Retrieved","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. 2024. CUDA Deep Neural Network library. Retrieved November 20, 2024 from https:\/\/developer.nvidia.com\/cudnn v8.7.0."},{"key":"e_1_3_2_1_20_1","volume-title":"Retrieved","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. 2024. TensorRT. Retrieved November 20, 2024 from https:\/\/developer.nvidia.com\/tensorrt v8.5.3."},{"key":"e_1_3_2_1_21_1","volume-title":"Sequence to sequence learning with neural networks. Advances in neural information processing systems 27","author":"Sutskever Ilya","year":"2014","unstructured":"Ilya Sutskever, Oriol Vinyals, and Quoc V Le. 2014. Sequence to sequence learning with neural networks. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467093"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00907"}],"event":{"name":"PPoPP '25: The 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"],"location":"Las Vegas NV USA","acronym":"PPoPP '25"},"container-title":["Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710857","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3710848.3710857","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:15:23Z","timestamp":1755875723000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710857"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,28]]},"references-count":25,"alternative-id":["10.1145\/3710848.3710857","10.1145\/3710848"],"URL":"https:\/\/doi.org\/10.1145\/3710848.3710857","relation":{},"subject":[],"published":{"date-parts":[[2025,2,28]]},"assertion":[{"value":"2025-02-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}