{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:13:36Z","timestamp":1780708416806,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,21]],"date-time":"2023-06-21T00:00:00Z","timestamp":1687305600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100006228","name":"Oak Ridge National Laboratory","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006228","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006224","name":"Argonne National Laboratory","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006224","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,21]]},"DOI":"10.1145\/3577193.3593704","type":"proceedings-article","created":{"date-parts":[[2023,6,20]],"date-time":"2023-06-20T18:47:05Z","timestamp":1687286825000},"page":"203-214","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":41,"title":["A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2756-4290","authenticated-orcid":false,"given":"Siddharth","family":"Singh","sequence":"first","affiliation":[{"name":"Department of Computer Science, University of Maryland, College Park, Maryland, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5508-0728","authenticated-orcid":false,"given":"Olatunji","family":"Ruwase","sequence":"additional","affiliation":[{"name":"Microsoft, Inc., Redmond, Washington, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6272-3760","authenticated-orcid":false,"given":"Ammar Ahmad","family":"Awan","sequence":"additional","affiliation":[{"name":"Microsoft, Inc., Redmond, Washington, United States"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0386-8759","authenticated-orcid":false,"given":"Samyam","family":"Rajbhandari","sequence":"additional","affiliation":[{"name":"Microsoft, Inc., Redmond, Washington, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0478-8854","authenticated-orcid":false,"given":"Yuxiong","family":"He","sequence":"additional","affiliation":[{"name":"Microsoft, Inc., Redmond, Washington, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3069-3701","authenticated-orcid":false,"given":"Abhinav","family":"Bhatele","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Maryland, College Park, Maryland, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,6,21]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2112.10684"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2105.14450"},{"key":"e_1_3_2_1_3_1","unstructured":"BigScience. 2022. BigScience Large Open-science Open-access Multilingual Language Model. https:\/\/huggingface.co\/bigscience\/bloom. BigScience. 2022. BigScience Large Open-science Open-access Multilingual Language Model. https:\/\/huggingface.co\/bigscience\/bloom."},{"key":"e_1_3_2_1_4_1","volume-title":"Language Models are Few-Shot Learners. CoRR abs\/2005.14165","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown , Benjamin Mann , Nick Ryder , Melanie Subbiah , Jared Kaplan , Prafulla Dhariwal , Arvind Neelakantan , Pranav Shyam , Girish Sastry , Amanda Askell , Sandhini Agarwal , Ariel Herbert-Voss , Gretchen Krueger , Tom Henighan , Rewon Child , Aditya Ramesh , Daniel M. Ziegler , Jeffrey Wu , Clemens Winter , Christopher Hesse , Mark Chen , Eric Sigler , Mateusz Litwin , Scott Gray , Benjamin Chess , Jack Clark , Christopher Berner , Sam McCandlish , Alec Radford , Ilya Sutskever , and Dario Amodei . 2020. Language Models are Few-Shot Learners. CoRR abs\/2005.14165 ( 2020 ). arXiv:2005.14165 https:\/\/arxiv.org\/abs\/2005.14165 Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. CoRR abs\/2005.14165 (2020). arXiv:2005.14165 https:\/\/arxiv.org\/abs\/2005.14165"},{"key":"e_1_3_2_1_5_1","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. arXiv:1604.06174 [cs.LG] Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. arXiv:1604.06174 [cs.LG]"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Damai Dai Li Dong Shuming Ma Bo Zheng Zhifang Sui Baobao Chang and Furu Wei. 2022. StableMoE: Stable Routing Strategy for Mixture of Experts. arXiv:2204.08396 [cs.LG] Damai Dai Li Dong Shuming Ma Bo Zheng Zhifang Sui Baobao Chang and Furu Wei. 2022. StableMoE: Stable Routing Strategy for Mixture of Experts. arXiv:2204.08396 [cs.LG]","DOI":"10.18653\/v1\/2022.acl-long.489"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1903.06681"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356207"},{"key":"e_1_3_2_1_9_1","unstructured":"Nan Du Yanping Huang Andrew M. Dai Simon Tong Dmitry Lepikhin Yuanzhong Xu Maxim Krikun Yanqi Zhou Adams Wei Yu Orhan Firat Barret Zoph Liam Fedus Maarten Bosma Zongwei Zhou Tao Wang Yu Emma Wang Kellie Webster Marie Pellat Kevin Robinson Kathleen Meier-Hellstern Toju Duke Lucas Dixon Kun Zhang Quoc V Le Yonghui Wu Zhifeng Chen and Claire Cui. 2022. GLaM: Efficient Scaling of Language Models with Mixture-of-Experts. arXiv:2112.06905 [cs.CL] Nan Du Yanping Huang Andrew M. Dai Simon Tong Dmitry Lepikhin Yuanzhong Xu Maxim Krikun Yanqi Zhou Adams Wei Yu Orhan Firat Barret Zoph Liam Fedus Maarten Bosma Zongwei Zhou Tao Wang Yu Emma Wang Kellie Webster Marie Pellat Kevin Robinson Kathleen Meier-Hellstern Toju Duke Lucas Dixon Kun Zhang Quoc V Le Yonghui Wu Zhifeng Chen and Claire Cui. 2022. GLaM: Efficient Scaling of Language Models with Mixture-of-Experts. arXiv:2112.06905 [cs.CL]"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2101.03961"},{"key":"e_1_3_2_1_11_1","volume-title":"The Pile: An 800GB Dataset of Diverse Text for Language Modeling. CoRR abs\/2101.00027","author":"Gao Leo","year":"2021","unstructured":"Leo Gao , Stella Biderman , Sid Black , Laurence Golding , Travis Hoppe , Charles Foster , Jason Phang , Horace He , Anish Thite , Noa Nabeshima , Shawn Presser , and Connor Leahy . 2021 . The Pile: An 800GB Dataset of Diverse Text for Language Modeling. CoRR abs\/2101.00027 (2021). arXiv:2101.00027 https:\/\/arxiv.org\/abs\/2101.00027 Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, Shawn Presser, and Connor Leahy. 2021. The Pile: An 800GB Dataset of Diverse Text for Language Modeling. CoRR abs\/2101.00027 (2021). arXiv:2101.00027 https:\/\/arxiv.org\/abs\/2101.00027"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_13_1","unstructured":"Jordan Hoffmann Sebastian Borgeaud Arthur Mensch Elena Buchatskaya Trevor Cai Eliza Rutherford Diego de Las Casas Lisa Anne Hendricks Johannes Welbl Aidan Clark Tom Hennigan Eric Noland Katie Millican George van den Driessche Bogdan Damoc Aurelia Guy Simon Osindero Karen Simonyan Erich Elsen Jack W. Rae Oriol Vinyals and Laurent Sifre. 2022. Training Compute-Optimal Large Language Models. arXiv:2203.15556 [cs.CL] Jordan Hoffmann Sebastian Borgeaud Arthur Mensch Elena Buchatskaya Trevor Cai Eliza Rutherford Diego de Las Casas Lisa Anne Hendricks Johannes Welbl Aidan Clark Tom Hennigan Eric Noland Katie Millican George van den Driessche Bogdan Damoc Aurelia Guy Simon Osindero Karen Simonyan Erich Elsen Jack W. Rae Oriol Vinyals and Laurent Sifre. 2022. Training Compute-Optimal Large Language Models. arXiv:2203.15556 [cs.CL]"},{"key":"e_1_3_2_1_14_1","volume-title":"Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang , Youlong Cheng , Ankur Bapna , Orhan Firat , Dehao Chen , Mia Chen , HyoukJoong Lee , Jiquan Ngiam , Quoc V Le , Yonghui Wu , and zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism . In Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc , E. Fox, and R. Garnett (Eds.), Vol. 32 . Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/ 2019 \/file\/093f65e080a295f8076b1c5722a46aa2-Paper.pdf Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, and zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.), Vol. 32. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/093f65e080a295f8076b1c5722a46aa2-Paper.pdf"},{"key":"e_1_3_2_1_15_1","volume-title":"Tutel: Adaptive Mixture-of-Experts at Scale. arXiv:2206.03382 [cs.DC]","author":"Hwang Changho","year":"2022","unstructured":"Changho Hwang , Wei Cui , Yifan Xiong , Ziyue Yang , Ze Liu , Han Hu , Zilong Wang , Rafael Salas , Jithin Jose , Prabhat Ram , Joe Chau , Peng Cheng , Fan Yang , Mao Yang , and Yongqiang Xiong . 2022 . Tutel: Adaptive Mixture-of-Experts at Scale. arXiv:2206.03382 [cs.DC] Changho Hwang, Wei Cui, Yifan Xiong, Ziyue Yang, Ze Liu, Han Hu, Zilong Wang, Rafael Salas, Jithin Jose, Prabhat Ram, Joe Chau, Peng Cheng, Fan Yang, Mao Yang, and Yongqiang Xiong. 2022. Tutel: Adaptive Mixture-of-Experts at Scale. arXiv:2206.03382 [cs.DC]"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2001.08361"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2109.10465"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2006.16668"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_20_1","volume-title":"Fixing Weight Decay Regularization in Adam. CoRR abs\/1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter . 2017. Fixing Weight Decay Regularization in Adam. CoRR abs\/1711.05101 ( 2017 ). arXiv:1711.05101 http:\/\/arxiv.org\/abs\/1711.05101 Ilya Loshchilov and Frank Hutter. 2017. Fixing Weight Decay Regularization in Adam. CoRR abs\/1711.05101 (2017). arXiv:1711.05101 http:\/\/arxiv.org\/abs\/1711.05101"},{"key":"e_1_3_2_1_21_1","unstructured":"Microsoft. 2021. 3D parallelism with MegatronLM and ZeRO Redundancy Optimizer. https:\/\/github.com\/microsoft\/DeepSpeedExamples\/tree\/master\/Megatron-LM-v1.1.5-3D_parallelism. Microsoft. 2021. 3D parallelism with MegatronLM and ZeRO Redundancy Optimizer. https:\/\/github.com\/microsoft\/DeepSpeedExamples\/tree\/master\/Megatron-LM-v1.1.5-3D_parallelism."},{"key":"e_1_3_2_1_22_1","volume-title":"Efficient Large-Scale Language Model Training on GPU Clusters. CoRR abs\/2104.04473","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan , Mohammad Shoeybi , Jared Casper , Patrick LeGresley , Mostofa Patwary , Vijay Korthikanti , Dmitri Vainbrand , Prethvi Kashinkunti , Julie Bernauer , Bryan Catanzaro , Amar Phanishayee , and Matei Zaharia . 2021. Efficient Large-Scale Language Model Training on GPU Clusters. CoRR abs\/2104.04473 ( 2021 ). arXiv:2104.04473 https:\/\/arxiv.org\/abs\/2104.04473 Deepak Narayanan, Mohammad Shoeybi, Jared Casper, Patrick LeGresley, Mostofa Patwary, Vijay Korthikanti, Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, Amar Phanishayee, and Matei Zaharia. 2021. Efficient Large-Scale Language Model Training on GPU Clusters. CoRR abs\/2104.04473 (2021). arXiv:2104.04473 https:\/\/arxiv.org\/abs\/2104.04473"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2203.14685"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1910.10683"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2201.05596"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari , Jeff Rasley , Olatunji Ruwase , and Yuxiong He . 2020 . ZeRO: Memory Optimizations toward Training Trillion Parameter Models . In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis ( Atlanta, Georgia) (SC '20). IEEE Press, Article 20, 16 pages. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. ZeRO: Memory Optimizations toward Training Trillion Parameter Models. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC '20). IEEE Press, Article 20, 16 pages."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_28_1","volume-title":"Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He.","author":"Ren Jie","year":"2021","unstructured":"Jie Ren , Samyam Rajbhandari , Reza Yazdani Aminabadi , Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021 . ZeRO-Offload: Democratizing Billion-Scale Model Training. CoRR abs\/2101.06840 (2021). arXiv:2101.06840 https:\/\/arxiv.org\/abs\/2101.06840 Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. CoRR abs\/2101.06840 (2021). arXiv:2101.06840 https:\/\/arxiv.org\/abs\/2101.06840"},{"key":"e_1_3_2_1_29_1","volume-title":"Daniel Keysers, and Neil Houlsby.","author":"Riquelme Carlos","year":"2021","unstructured":"Carlos Riquelme , Joan Puigcerver , Basil Mustafa , Maxim Neumann , Rodolphe Jenatton , Andr\u00e9 Susano Pinto , Daniel Keysers, and Neil Houlsby. 2021 . Scaling Vision with Sparse Mixture of Experts . arXiv:2106.05974 [cs.CV] Carlos Riquelme, Joan Puigcerver, Basil Mustafa, Maxim Neumann, Rodolphe Jenatton, Andr\u00e9 Susano Pinto, Daniel Keysers, and Neil Houlsby. 2021. Scaling Vision with Sparse Mixture of Experts. arXiv:2106.05974 [cs.CV]"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1701.06538"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2205.10034"},{"key":"e_1_3_2_1_32_1","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv:1909.08053 [cs.CL] Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2020. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. arXiv:1909.08053 [cs.CL]"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the IEEE International Parallel & Distributed Processing Symposium (IPDPS '22)","author":"Singh Siddharth","year":"2022","unstructured":"Siddharth Singh and Abhinav Bhatele . 2022 . AxoNN: An asynchronous, message-driven parallel framework for extreme-scale deep learning . In Proceedings of the IEEE International Parallel & Distributed Processing Symposium (IPDPS '22) . IEEE Computer Society. Siddharth Singh and Abhinav Bhatele. 2022. AxoNN: An asynchronous, message-driven parallel framework for extreme-scale deep learning. In Proceedings of the IEEE International Parallel & Distributed Processing Symposium (IPDPS '22). IEEE Computer Society."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2201.11990"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS49936.2021.00109"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2005.03300"},{"key":"e_1_3_2_1_37_1","volume-title":"CoRR abs\/1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N. Gomez , Lukasz Kaiser , and Illia Polosukhin . 2017. Attention Is All You Need. CoRR abs\/1706.03762 ( 2017 ). arXiv:1706.03762 http:\/\/arxiv.org\/abs\/1706.03762 Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. CoRR abs\/1706.03762 (2017). arXiv:1706.03762 http:\/\/arxiv.org\/abs\/1706.03762"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545087"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2104.05343"},{"key":"e_1_3_2_1_40_1","unstructured":"Fuzhao Xue Ziji Shi Futao Wei Yuxuan Lou Yong Liu and Yang You. 2021. Go Wider Instead of Deeper. arXiv:2107.11817 [cs.LG] Fuzhao Xue Ziji Shi Futao Wei Yuxuan Lou Yong Liu and Yang You. 2021. Go Wider Instead of Deeper. arXiv:2107.11817 [cs.LG]"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Bernard Nguyen Geeta Chauhan Yuchen Hao and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arXiv:2304.11277 [cs.DC] Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Bernard Nguyen Geeta Chauhan Yuchen Hao and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. arXiv:2304.11277 [cs.DC]","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Yukun Zhu Ryan Kiros Richard Zemel Ruslan Salakhutdinov Raquel Urtasun Antonio Torralba and Sanja Fidler. 2015. Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books. In arXiv preprint arXiv:1506.06724. Yukun Zhu Ryan Kiros Richard Zemel Ruslan Salakhutdinov Raquel Urtasun Antonio Torralba and Sanja Fidler. 2015. Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books. In arXiv preprint arXiv:1506.06724.","DOI":"10.1109\/ICCV.2015.11"},{"key":"e_1_3_2_1_43_1","volume-title":"Hany Hassan, Ruofei Zhang, Tuo Zhao, and Jianfeng Gao.","author":"Zuo Simiao","year":"2022","unstructured":"Simiao Zuo , Xiaodong Liu , Jian Jiao , Young Jin Kim , Hany Hassan, Ruofei Zhang, Tuo Zhao, and Jianfeng Gao. 2022 . Taming Sparsely Activated Transformer with Stochastic Experts . arXiv:2110.04260 [cs.CL] Simiao Zuo, Xiaodong Liu, Jian Jiao, Young Jin Kim, Hany Hassan, Ruofei Zhang, Tuo Zhao, and Jianfeng Gao. 2022. Taming Sparsely Activated Transformer with Stochastic Experts. arXiv:2110.04260 [cs.CL]"}],"event":{"name":"ICS '23: 37th International Conference on Supercomputing","location":"Orlando FL USA","acronym":"ICS '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 37th International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3577193.3593704","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3577193.3593704","content-type":"text\/html","content-version":"vor","intended-application":"syndication"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:31Z","timestamp":1750178851000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3577193.3593704"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,21]]},"references-count":43,"alternative-id":["10.1145\/3577193.3593704","10.1145\/3577193"],"URL":"https:\/\/doi.org\/10.1145\/3577193.3593704","relation":{},"subject":[],"published":{"date-parts":[[2023,6,21]]},"assertion":[{"value":"2023-06-21","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}