{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T21:21:36Z","timestamp":1773091296810,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":15,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,23]],"date-time":"2024-06-23T00:00:00Z","timestamp":1719100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,23]]},"DOI":"10.1145\/3649329.3655951","type":"proceedings-article","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:27:22Z","timestamp":1731007642000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["MoNDE: Mixture of Near-Data Experts for Large-Scale Sparse Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3005-2066","authenticated-orcid":false,"given":"Taehyun","family":"Kim","sequence":"first","affiliation":[{"name":"Seoul National University, Seoul, Seoul, Republic of Korea"},{"name":"Inter-University Semiconductor Research Center, Seoul, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1570-4463","authenticated-orcid":false,"given":"Kwanseok","family":"Choi","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4091-6907","authenticated-orcid":false,"given":"Youngmock","family":"Cho","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Seoul, Republic of Korea"},{"name":"Inter-University Semiconductor Research Center, Seoul, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6853-368X","authenticated-orcid":false,"given":"Jaehoon","family":"Cho","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8895-9117","authenticated-orcid":false,"given":"Hyuk-Jae","family":"Lee","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Seoul, Republic of Korea"},{"name":"Inter-University Semiconductor Research Center, Seoul, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0403-9928","authenticated-orcid":false,"given":"Jaewoong","family":"Sim","sequence":"additional","affiliation":[{"name":"Seoul National University, Seoul, Seoul, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2024,11,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT.","author":"Jacob Devlin","year":"2019","unstructured":"Jacob Devlin et al. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT."},{"key":"e_1_3_2_1_2_1","unstructured":"William Fedus et al. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. JMLR (2022)."},{"key":"e_1_3_2_1_3_1","unstructured":"Google. 2021. Hugging Face Switch Transformers. https:\/\/huggingface.co\/docs\/transformers\/model_doc\/switch_transformers."},{"key":"e_1_3_2_1_4_1","unstructured":"Haiyang Huang et al. 2023. Towards MoE Deployment: Mitigating Inefficiencies in Mixture-of-Expert (MoE) Inference. 
arXiv preprint arXiv:2303.06182 (2023)."},{"key":"e_1_3_2_1_5_1","unstructured":"Jin Hyun Kim et al. 2023. Samsung PIM\/PNM for Transformer Based AI: Energy Efficiency on PIM\/PNM Cluster. In HCS."},{"key":"e_1_3_2_1_6_1","volume-title":"Ramulator: A fast and extensible DRAM simulator","author":"Yoongu Kim","year":"2015","unstructured":"Yoongu Kim et al. 2015. Ramulator: A fast and extensible DRAM simulator. IEEE CAL (2015)."},{"key":"e_1_3_2_1_7_1","unstructured":"Meta. 2022. Hugging Face NLLB MoE Model Hub. https:\/\/huggingface.co\/facebook\/nllb-moe-54b."},{"key":"e_1_3_2_1_9_1","unstructured":"Colin Raffel et al. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR (2020)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_11_1","volume-title":"Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters. In KD.","author":"Jeff Rasley","year":"2020","unstructured":"Jeff Rasley et al. 2020. Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters. In KD."},{"key":"e_1_3_2_1_12_1","unstructured":"Jie Ren et al. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training.. In ATC."},{"key":"e_1_3_2_1_13_1","unstructured":"Swapnil Sharma et al. 2023. Stochastic Code Generation. arXiv:2304.08243 (2023)."},{"key":"e_1_3_2_1_14_1","unstructured":"Noam Shazeer et al. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. In ICLR."},{"key":"e_1_3_2_1_15_1","unstructured":"Liang Shen et al. 2023. SE-MoE: A Scalable and Efficient Mixture-of-Experts Distributed Training and Inference System. arXiv:2205.10034 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Transformers: State-of-the-Art Natural Language Processing. In EMNLP.","author":"Thomas Wolf","year":"2020","unstructured":"Thomas Wolf et al. 2020. Transformers: State-of-the-Art Natural Language Processing. In EMNLP."}],"event":{"name":"DAC '24: 61st ACM\/IEEE Design Automation Conference","location":"San Francisco CA USA","acronym":"DAC '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE-CEDA","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 61st ACM\/IEEE Design Automation Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3655951","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649329.3655951","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:54Z","timestamp":1750295874000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3655951"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,23]]},"references-count":15,"alternative-id":["10.1145\/3649329.3655951","10.1145\/3649329"],"URL":"https:\/\/doi.org\/10.1145\/3649329.3655951","relation":{},"subject":[],"published":{"date-parts":[[2024,6,23]]},"assertion":[{"value":"2024-11-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}