{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T06:55:40Z","timestamp":1772866540513,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T00:00:00Z","timestamp":1740700800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,2,28]]},"DOI":"10.1145\/3710848.3710868","type":"proceedings-article","created":{"date-parts":[[2025,2,28]],"date-time":"2025-02-28T06:20:57Z","timestamp":1740723657000},"page":"170-182","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Harnessing Inter-GPU Shared Memory for Seamless MoE Communication-Computation Fusion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-8329-0145","authenticated-orcid":false,"given":"Hulin","family":"Wang","sequence":"first","affiliation":[{"name":"School of Computer Science, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8101-785X","authenticated-orcid":false,"given":"Yaqi","family":"Xia","sequence":"additional","affiliation":[{"name":"School of Computer Science, Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3913-3623","authenticated-orcid":false,"given":"Donglin","family":"Yang","sequence":"additional","affiliation":[{"name":"Nvidia Corporation, Santa Clara, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9500-3390","authenticated-orcid":false,"given":"Xiaobo","family":"Zhou","sequence":"additional","affiliation":[{"name":"IOTSC, University of Macau, Macau S.A.R"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2869-7623","authenticated-orcid":false,"given":"Dazhao","family":"Cheng","sequence":"additional","affiliation":[{"name":"School of Computer Science, Wuhan University, Wuhan, China"}]}],"member":"320","published-online":{"date-parts":[[2025,2,28]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. Language models are few-shot learners. In Advances in Neural Information Processing Systems (NeurIPS), pages 1877--1901, 2020."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01311"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651379"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577484"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"e_1_3_2_1_6_1","first-page":"4171","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (ACL)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (ACL), pages 4171--4186, 2019."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638466"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_9_1","first-page":"389","volume-title":"Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)","author":"Fang Jiarui","year":"2021","unstructured":"Jiarui Fang, Yang Yu, Chengduo Zhao, and Jie Zhou. Turbotransformers: an efficient gpu serving system for transformer models. In Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP), pages 389--402, 2021."},{"key":"e_1_3_2_1_10_1","first-page":"1","volume-title":"Journal of Machine Learning Research (JMLR)","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research (JMLR), pages 1--39, 2022."},{"key":"e_1_3_2_1_11_1","volume-title":"Fastmoe: A fast mixture-of-expert training system. arXiv preprint arXiv:2103.13262","author":"He Jiaao","year":"2021","unstructured":"Jiaao He, Jiezhong Qiu, Aohan Zeng, Zhilin Yang, Jidong Zhai, and Jie Tang. Fastmoe: A fast mixture-of-expert training system. arXiv preprint arXiv:2103.13262, 2021."},{"key":"e_1_3_2_1_12_1","first-page":"120","volume-title":"Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP)","author":"He Jiaao","year":"2022","unstructured":"Jiaao He, Jidong Zhai, Tiago Antunes, Haojie Wang, Fuwen Luo, Shangfeng Shi, and Qin Li. Fastermoe: modeling and optimizing training of large-scale dynamic pre-trained models. In Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP), pages 120--134, 2022."},{"key":"e_1_3_2_1_13_1","unstructured":"Marcus Hutter. he human knowledge compression contest. http:\/\/prize.hutter1.net\/ 2012."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of Machine Learning and Systems (MLSys)","author":"Hwang Changho","year":"2023","unstructured":"Changho Hwang, Wei Cui, Yifan Xiong, Ziyue Yang, Ze Liu, Han Hu, Zilong Wang, Rafael Salas, Jithin Jose, Prabhat Ram, et al. Tutel: Adaptive mixture-of-experts at scale. In Proceedings of Machine Learning and Systems (MLSys), 2023."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507778"},{"key":"e_1_3_2_1_16_1","volume-title":"CoRR","author":"Jiang Albert Q.","year":"2024","unstructured":"Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de Las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, L\u00e9lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, Th\u00e9ophile Gervet, Thibaut Lavril, Thomas Wang, Timoth\u00e9e Lacroix, and William El Sayed. Mixtral of experts. CoRR, 2024."},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. {GS}hard: Scaling giant models with conditional computation and automatic sharding. In International Conference on Learning Representations (ICLR), 2021."},{"key":"e_1_3_2_1_18_1","first-page":"6265","volume-title":"International Conference on Machine Learning (ICML)","author":"Lewis Mike","year":"2021","unstructured":"Mike Lewis, Shruti Bhosale, Tim Dettmers, Naman Goyal, and Luke Zettlemoyer. Base layers: Simplifying training of large, sparse models. In International Conference on Machine Learning (ICML), pages 6265--6274, 2021."},{"key":"e_1_3_2_1_19_1","first-page":"945","volume-title":"2023 USENIX Annual Technical Conference (ATC)","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, Cong Wang, and Hong Xu. Accelerating distributed {MoE} training and inference with lina. In 2023 USENIX Annual Technical Conference (ATC), pages 945--959, 2023."},{"key":"e_1_3_2_1_20_1","first-page":"663","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. AlpaServe: Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI), pages 663--679, 2023."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604869"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588964"},{"key":"e_1_3_2_1_24_1","volume-title":"Nvidia collective communication library (nccl). https:\/\/developer.nvidia.com\/nccl","year":"2024","unstructured":"Nvidia. Nvidia collective communication library (nccl). https:\/\/developer.nvidia.com\/nccl, 2024."},{"key":"e_1_3_2_1_25_1","volume-title":"Nvshmem communication library. https:\/\/developer.nvidia.com\/nvshmem","year":"2024","unstructured":"Nvidia. Nvshmem communication library. https:\/\/developer.nvidia.com\/nvshmem, 2024."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_27_1","first-page":"8583","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Riquelme Carlos","year":"2021","unstructured":"Carlos Riquelme, Joan Puigcerver, Basil Mustafa, Maxim Neumann, Rodolphe Jenatton, Andr\u00e9 Susano Pinto, Daniel Keysers, and Neil Houlsby. Scaling vision with sparse mixture of experts. In Advances in Neural Information Processing Systems (NeurIPS), pages 8583--8595, 2021."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. In International Conference on Learning Representations (ICLR), 2017."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM53939.2023.10228874"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3650083"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575712"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651359"},{"key":"e_1_3_2_1_33_1","volume-title":"CoRR","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton-Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aur\u00e9lien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: Open foundation and fine-tuned chat models. CoRR, 2023."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567959"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00055"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"e_1_3_2_1_37_1","first-page":"779","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Wang Yuke","year":"2023","unstructured":"Yuke Wang, Boyuan Feng, Zheng Wang, Tong Geng, Kevin Barker, Ang Li, and Yufei Ding. MGG: Accelerating graph neural networks with Fine-Grained Intra-Kernel Communication-Computation pipelining on Multi-GPU platforms. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI), pages 779--795, 2023."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00056"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588195.3592990"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/3691992.3692026"},{"key":"e_1_3_2_1_41_1","first-page":"961","volume-title":"2023 USENIX Annual Technical Conference (ATC)","author":"Zhai Mingshu","year":"2023","unstructured":"Mingshu Zhai, Jiaao He, Zixuan Ma, Zan Zong, Runqing Zhang, and Jidong Zhai. {SmartMoE}: Efficiently training {Sparsely-Activated} models through combining offline and online parallelization. In 2023 USENIX Annual Technical Conference (ATC), pages 961--975, 2023."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00026"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-07312-0_1"},{"key":"e_1_3_2_1_44_1","first-page":"7103","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Zhou Yanqi","year":"2022","unstructured":"Yanqi Zhou, Tao Lei, Hanxiao Liu, Nan Du, Yanping Huang, Vincent Zhao, Andrew M Dai, Quoc V Le, James Laudon, et al. Mixture-of-experts with expert choice routing. In Advances in Neural Information Processing Systems (NeurIPS), pages 7103--7114, 2022."}],"event":{"name":"PPoPP '25: The 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","location":"Las Vegas NV USA","acronym":"PPoPP '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710868","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3710848.3710868","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:16:23Z","timestamp":1755875783000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3710848.3710868"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,28]]},"references-count":44,"alternative-id":["10.1145\/3710848.3710868","10.1145\/3710848"],"URL":"https:\/\/doi.org\/10.1145\/3710848.3710868","relation":{},"subject":[],"published":{"date-parts":[[2025,2,28]]},"assertion":[{"value":"2025-02-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}