{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T19:14:16Z","timestamp":1774120456435,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,3,28]],"date-time":"2022-03-28T00:00:00Z","timestamp":1648425600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Key R&D Program of China","award":["2021ZD0110202"],"award-info":[{"award-number":["2021ZD0110202"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U20A20226"],"award-info":[{"award-number":["U20A20226"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"NSFC for Distinguished Young Scholar","award":["61825602"],"award-info":[{"award-number":["61825602"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,4,2]]},"DOI":"10.1145\/3503221.3508417","type":"proceedings-article","created":{"date-parts":[[2022,3,28]],"date-time":"2022-03-28T13:58:22Z","timestamp":1648475902000},"page":"192-204","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":42,"title":["BaGuaLu"],"prefix":"10.1145","author":[{"given":"Zixuan","family":"Ma","sequence":"first","affiliation":[{"name":"Tsinghua University"}]},{"given":"Jiaao","family":"He","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Jiezhong","family":"Qiu","sequence":"additional","affiliation":[{"name":"Tsinghua University and Beijing Academy of Artificial Intelligence"}]},{"given":"Huanqi","family":"Cao","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Yuanwei","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Zhenbo","family":"Sun","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Liyan","family":"Zheng","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Haojie","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Shizhi","family":"Tang","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Tianyu","family":"Zheng","sequence":"additional","affiliation":[{"name":"Zhejiang Lab"}]},{"given":"Junyang","family":"Lin","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Guanyu","family":"Feng","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Zeqiang","family":"Huang","sequence":"additional","affiliation":[{"name":"Zhejiang Lab"}]},{"given":"Jie","family":"Gao","sequence":"additional","affiliation":[{"name":"Zhejiang Lab"}]},{"given":"Aohan","family":"Zeng","sequence":"additional","affiliation":[{"name":"Tsinghua University and Beijing Academy of Artificial Intelligence"}]},{"given":"Jianwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Runxin","family":"Zhong","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Tianhui","family":"Shi","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Sha","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang 
Lab"}]},{"given":"Weimin","family":"Zheng","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Jie","family":"Tang","sequence":"additional","affiliation":[{"name":"Tsinghua University and Beijing Academy of Artificial Intelligencea"}]},{"given":"Hongxia","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Xin","family":"Liu","sequence":"additional","affiliation":[{"name":"Zhejiang Lab"}]},{"given":"Jidong","family":"Zhai","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]},{"given":"Wenguang","family":"Chen","sequence":"additional","affiliation":[{"name":"Tsinghua University"}]}],"member":"320","published-online":{"date-parts":[[2022,3,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arXiv:2005.14165 [cs.CL]"},{"key":"e_1_3_2_1_2_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_3_1","volume-title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. arXiv:2101.03961 [cs.LG]","author":"Fedus William","year":"2021","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2021. Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. arXiv:2101.03961 [cs.LG]"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-016-5588-7"},{"key":"e_1_3_2_1_5_1","volume-title":"Identity Mappings in Deep Residual Networks. In ECCV","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Identity Mappings in Deep Residual Networks. In ECCV 2016. 630--645."},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Xu Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8--14, 2019, Vancouver, BC, Canada, Hanna M. Wallach, Hugo Larochelle, Alina Beygelzimer, Florence d'Alch\u00e9-Buc, Emily B. Fox, and Roman Garnett (Eds.). 103--112."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00054"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126916"},{"key":"e_1_3_2_1_9_1","volume-title":"Exascale deep learning for scientific inverse problems. 
arXiv preprint arXiv:1909.11150","author":"Laanait Nouamane","year":"2019","unstructured":"Nouamane Laanait, Joshua Romero, Junqi Yin, M Todd Young, Sean Treichler, Vitalii Starchenko, Albina Borisevich, Alex Sergeev, and Michael Matheson. 2019. Exascale deep learning for scientific inverse problems. arXiv preprint arXiv:1909.11150 (2019)."},{"key":"e_1_3_2_1_10_1","volume-title":"Fat-trees: universal networks for hardware-efficient supercomputing","author":"Leiserson Charles E","year":"1985","unstructured":"Charles E Leiserson. 1985. Fat-trees: universal networks for hardware-efficient supercomputing. IEEE transactions on Computers 100, 10 (1985), 892--901."},{"key":"e_1_3_2_1_11_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020)."},{"key":"e_1_3_2_1_12_1","unstructured":"Mike Lewis Shruti Bhosale Tim Dettmers Naman Goyal and Luke Zettlemoyer. 2021. BASE Layers: Simplifying Training of Large Sparse Models. arXiv:2103.16716 [cs.CL]"},{"key":"e_1_3_2_1_13_1","volume-title":"M6: A Chinese Multimodal Pretrainer. CoRR abs\/2103.00823","author":"Lin Junyang","year":"2021","unstructured":"Junyang Lin, Rui Men, An Yang, Chang Zhou, Ming Ding, Yichang Zhang, Peng Wang, Ang Wang, Le Jiang, Xianyan Jia, Jie Zhang, Jianwei Zhang, Xu Zou, Zhikang Li, Xiaodong Deng, Jie Liu, Jinbao Xue, Huiling Zhou, Jianxin Ma, Jin Yu, Yong Li, Wei Lin, Jingren Zhou, Jie Tang, and Hongxia Yang. 2021. M6: A Chinese Multimodal Pretrainer. CoRR abs\/2103.00823 (2021)."},{"key":"e_1_3_2_1_14_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_15_1","volume-title":"Fixing Weight Decay Regularization in Adam. CoRR abs\/1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Fixing Weight Decay Regularization in Adam. CoRR abs\/1711.05101 (2017)."},{"key":"e_1_3_2_1_16_1","unstructured":"NVIDIA. 2021. Apex (A PyTorch Extension). https:\/\/nvidia.github.io\/apex\/"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00053"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData47090.2019.9006467"},{"key":"e_1_3_2_1_19_1","volume-title":"Deep contextualized word representations. CoRR abs\/1802.05365","author":"Peters Matthew E.","year":"2018","unstructured":"Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, and Luke Zettlemoyer. 2018. Deep contextualized word representations. CoRR abs\/1802.05365 (2018). arXiv:1802.05365 http:\/\/arxiv.org\/abs\/1802.05365"},{"key":"e_1_3_2_1_20_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans and Ilya Sutskever. 2018. Improving language understanding by generative pre-training. 
(2018)."},{"key":"e_1_3_2_1_21_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_2_1_22_1","first-page":"1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research 21 (2020), 1--67.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. ZeRO: Memory Optimizations toward Training Trillion Parameter Models. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC '20). IEEE Press, Article 20, 16 pages."},{"key":"e_1_3_2_1_24_1","volume-title":"Msa transformer. bioRxiv","author":"Rao Roshan","year":"2021","unstructured":"Roshan Rao, Jason Liu, Robert Verkuil, Joshua Meier, John F Canny, Pieter Abbeel, Tom Sercu, and Alexander Rives. 2021. Msa transformer. bioRxiv (2021)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_26_1","volume-title":"Molecular transformer: a model for uncertainty-calibrated chemical reaction prediction. ACS central science 5, 9","author":"Schwaller Philippe","year":"2019","unstructured":"Philippe Schwaller, Teodoro Laino, Th\u00e9ophile Gaudin, Peter Bolgar, Christopher A Hunter, Costas Bekas, and Alpha A Lee. 2019. Molecular transformer: a model for uncertainty-calibrated chemical reaction prediction. ACS central science 5, 9 (2019), 1572--1583."},{"key":"e_1_3_2_1_27_1","volume-title":"Alex Bridgland, et al.","author":"Senior Andrew W","year":"2020","unstructured":"Andrew W Senior, Richard Evans, John Jumper, James Kirkpatrick, Laurent Sifre, Tim Green, Chongli Qin, Augustin \u017d\u00eddek, Alexander WR Nelson, Alex Bridgland, et al. 2020. Improved protein structure prediction using potentials from deep learning. Nature 577, 7792 (2020), 706--710."},{"key":"e_1_3_2_1_28_1","volume-title":"Mesh-tensorflow: Deep learning for supercomputers. arXiv preprint arXiv:1811.02084","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Penporn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, et al. 2018. Mesh-tensorflow: Deep learning for supercomputers. arXiv preprint arXiv:1811.02084 (2018)."},{"key":"e_1_3_2_1_29_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_30_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using gpu model parallelism. 
arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using gpu model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_31_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998--6008."},{"key":"e_1_3_2_1_32_1","volume-title":"XLNet: Generalized Autoregressive Pretraining for Language Understanding. arXiv preprint arXiv:1906.08237","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, and Quoc V Le. 2019. XLNet: Generalized Autoregressive Pretraining for Language Understanding. arXiv preprint arXiv:1906.08237 (2019)."},{"key":"e_1_3_2_1_33_1","volume-title":"Adaptive Loss Scaling for Mixed Precision Training. CoRR abs\/1910.12385","author":"Zhao Ruizhe","year":"2019","unstructured":"Ruizhe Zhao, Brian Vogel, and Tanvir Ahmed. 2019. Adaptive Loss Scaling for Mixed Precision Training. CoRR abs\/1910.12385 (2019). arXiv:1910.12385 http:\/\/arxiv.org\/abs\/1910.12385"}],"event":{"name":"PPoPP '22: 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","location":"Seoul, Republic of Korea","acronym":"PPoPP '22","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503221.3508417","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503221.3508417","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:49Z","timestamp":1750186849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503221.3508417"}},"subtitle":["targeting brain scale pretrained models with over 37 million cores"],"short-title":[],"issued":{"date-parts":[[2022,3,28]]},"references-count":33,"alternative-id":["10.1145\/3503221.3508417","10.1145\/3503221"],"URL":"https:\/\/doi.org\/10.1145\/3503221.3508417","relation":{},"subject":[],"published":{"date-parts":[[2022,3,28]]},"assertion":[{"value":"2022-03-28","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
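
The record above is the "message" object that the public Crossref REST API returns for the work's DOI. As a minimal sketch of consuming it, the snippet below fetches the same record and reads back a few of the fields shown above; the third-party `requests` dependency, the timeout value, and the particular fields printed are illustrative assumptions, not part of the record itself.

```python
# Minimal sketch: fetch this work from the public Crossref REST API
# (https://api.crossref.org/works/{DOI}) and print a few fields that
# appear in the record above. Assumes the `requests` package is installed.
import requests

DOI = "10.1145/3503221.3508417"  # DOI taken from the record above
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the "message" object is the record shown above

title = work["title"][0]                  # "BaGuaLu"
subtitle = work.get("subtitle", [""])[0]  # "targeting brain scale ..."
authors = [f'{a.get("given", "")} {a.get("family", "")}'.strip()
           for a in work.get("author", [])]

print(f"{title}: {subtitle}")
print(f"{len(authors)} authors; first author: {authors[0]}")
print(f"References cited: {work.get('references-count')}")
```

Run as-is, this should report the title "BaGuaLu", 25 authors beginning with Zixuan Ma, and 33 references, matching the corresponding fields in the record.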