{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:02:49Z","timestamp":1750309369803,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":18,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T00:00:00Z","timestamp":1743379200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"OpenWebSearch.eu","award":["GA 101070014"],"award-info":[{"award-number":["GA 101070014"]}]},{"name":"German Federal Ministry of Education and Research (BMBF)","award":["01|S20049"],"award-info":[{"award-number":["01|S20049"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,31]]},"DOI":"10.1145\/3672608.3707747","type":"proceedings-article","created":{"date-parts":[[2025,5,14]],"date-time":"2025-05-14T18:26:21Z","timestamp":1747247181000},"page":"945-852","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Mixture of Modular Experts: Distilling Knowledge from a Multilingual Teacher into Specialized Modular Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0127-8034","authenticated-orcid":false,"given":"Mohammed","family":"Al-Maamari","sequence":"first","affiliation":[{"name":"University of Passau, Passau, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1785-8367","authenticated-orcid":false,"given":"Mehdi","family":"Ben Amor","sequence":"additional","affiliation":[{"name":"University of Passau, Passau, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3220-8749","authenticated-orcid":false,"given":"Jelena","family":"Mitrovi\u0107","sequence":"additional","affiliation":[{"name":"University of Passau, Passau, Germany"},{"name":"Institute for Artificial Intelligence Research and Development of Serbia, Novi Sad, Serbia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3566-5507","authenticated-orcid":false,"given":"Michael","family":"Granitzer","sequence":"additional","affiliation":[{"name":"University of Passau, Passau, Germany"}]}],"member":"320","published-online":{"date-parts":[[2025,5,14]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Tianle Li, Dacheng Li, Hao Zhang, Banghua Zhu, Michael Jordan, Joseph E. Gonzalez, and Ion Stoica.","author":"Chiang Wei-Lin","year":"2024","unstructured":"Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Hao Zhang, Banghua Zhu, Michael Jordan, Joseph E. Gonzalez, and Ion Stoica. 2024. Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference. arXiv:2403.04132 [cs.AI]"},{"key":"e_1_3_2_1_2_1","volume-title":"International conference on machine learning. PMLR, 4057\u20134086","author":"Clark Aidan","year":"2022","unstructured":"Aidan Clark, Diego de Las Casas, Aurelia Guy, Arthur Mensch, Michela Paganini, Jordan Hoffmann, Bogdan Damoc, Blake Hechtman, Trevor Cai, Sebastian Borgeaud, et al. 2022. Unified scaling laws for routed language models. In International conference on machine learning. PMLR, 4057\u20134086."},{"key":"e_1_3_2_1_3_1","unstructured":"CodeParrot. n.d.. github-code-clean Dataset. https:\/\/huggingface.co\/datasets\/codeparrot\/github-code-clean. 
Accessed: 2024-06-21."},{"key":"e_1_3_2_1_4_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research 23, 120 (2022), 1\u201339.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_5_1","volume-title":"An empirical investigation of catastrophic forgetting in gradient-based neural networks. arXiv preprint arXiv:1312.6211","author":"Goodfellow Ian J","year":"2013","unstructured":"Ian J Goodfellow, Mehdi Mirza, Da Xiao, Aaron Courville, and Yoshua Bengio. 2013. An empirical investigation of catastrophic forgetting in gradient-based neural networks. arXiv preprint arXiv:1312.6211 (2013)."},{"key":"e_1_3_2_1_6_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Gu Yuxian","year":"2023","unstructured":"Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. 2023. MiniLLM: Knowledge distillation of large language models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference. 2440\u20132452","author":"Guo Mandy","year":"2020","unstructured":"Mandy Guo, Zihang Dai, Denny Vrande\u010di\u0107, and Rami Al-Rfou. 2020. Wiki-40b: Multilingual language model dataset. In Proceedings of the Twelfth Language Resources and Evaluation Conference. 2440\u20132452."},{"key":"e_1_3_2_1_8_1","volume-title":"Don't stop pretraining: Adapt language models to domains and tasks. arXiv preprint arXiv:2004.10964","author":"Gururangan Suchin","year":"2020","unstructured":"Suchin Gururangan, Ana Marasovi\u0107, Swabha Swayamdipta, Kyle Lo, Iz Beltagy, Doug Downey, and Noah A Smith. 2020. Don't stop pretraining: Adapt language models to domains and tasks. arXiv preprint arXiv:2004.10964 (2020)."},{"key":"e_1_3_2_1_9_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_10_1","volume-title":"Adaptive mixtures of local experts. Neural computation 3, 1","author":"Jacobs Robert A","year":"1991","unstructured":"Robert A Jacobs, Michael I Jordan, Steven J Nowlan, and Geoffrey E Hinton. 1991. Adaptive mixtures of local experts. Neural computation 3, 1 (1991), 79\u201387."},{"key":"e_1_3_2_1_11_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. 2024. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Branch-train-merge: Embarrassingly parallel training of expert language models. arXiv preprint arXiv:2208.03306","author":"Li Margaret","year":"2022","unstructured":"Margaret Li, Suchin Gururangan, Tim Dettmers, Mike Lewis, Tim Althoff, Noah A Smith, and Luke Zettlemoyer. 2022. Branch-train-merge: Embarrassingly parallel training of expert language models. 
arXiv preprint arXiv:2208.03306 (2022)."},{"key":"e_1_3_2_1_13_1","volume-title":"Meta-KD: A meta knowledge distillation framework for language model compression across domains. arXiv preprint arXiv:2012.01266","author":"Pan Haojie","year":"2020","unstructured":"Haojie Pan, Chengyu Wang, Minghui Qiu, Yichang Zhang, Yaliang Li, and Jun Huang. 2020. Meta-KD: A meta knowledge distillation framework for language model compression across domains. arXiv preprint arXiv:2012.01266 (2020)."},{"key":"e_1_3_2_1_14_1","first-page":"8583","article-title":"Scaling vision with sparse mixture of experts","volume":"34","author":"Riquelme Carlos","year":"2021","unstructured":"Carlos Riquelme, Joan Puigcerver, Basil Mustafa, Maxim Neumann, Rodolphe Jenatton, Andr\u00e9 Susano Pinto, Daniel Keysers, and Neil Houlsby. 2021. Scaling vision with sparse mixture of experts. Advances in Neural Information Processing Systems 34 (2021), 8583\u20138595.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_15_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_1_16_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_17_1","volume-title":"Baptiste Rozi\u00e8re, Jacob Kahn, Daniel Li, Wen-tau Yih, Jason Weston, et al.","author":"Sukhbaatar Sainbayar","year":"2024","unstructured":"Sainbayar Sukhbaatar, Olga Golovneva, Vasu Sharma, Hu Xu, Xi Victoria Lin, Baptiste Rozi\u00e8re, Jacob Kahn, Daniel Li, Wen-tau Yih, Jason Weston, et al. 2024. Branch-Train-MiX: Mixing Expert LLMs into a Mixture-of-Experts LLM. arXiv preprint arXiv:2403.07816 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Adapt-and-distill: Developing small, fast and effective pretrained language models for domains. arXiv preprint arXiv:2106.13474","author":"Yao Yunzhi","year":"2021","unstructured":"Yunzhi Yao, Shaohan Huang, Wenhui Wang, Li Dong, and Furu Wei. 2021. Adapt-and-distill: Developing small, fast and effective pretrained language models for domains. 
arXiv preprint arXiv:2106.13474 (2021)."}],"event":{"name":"SAC '25: 40th ACM\/SIGAPP Symposium on Applied Computing","sponsor":["SIGAPP ACM Special Interest Group on Applied Computing"],"location":"Catania International Airport Catania Italy","acronym":"SAC '25"},"container-title":["Proceedings of the 40th ACM\/SIGAPP Symposium on Applied Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3672608.3707747","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3672608.3707747","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:14Z","timestamp":1750291574000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3672608.3707747"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,31]]},"references-count":18,"alternative-id":["10.1145\/3672608.3707747","10.1145\/3672608"],"URL":"https:\/\/doi.org\/10.1145\/3672608.3707747","relation":{},"subject":[],"published":{"date-parts":[[2025,3,31]]},"assertion":[{"value":"2025-05-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
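
A record with this structure ("status", "message-type", and the work metadata under "message") is what the public Crossref REST API returns for a DOI. The following is a minimal sketch, using only the Python standard library, of how such a record could be fetched and a few of the fields shown above read back out; the endpoint and field names follow Crossref's public documentation, while the contact address in the "mailto" parameter is a placeholder you would replace with your own.

import json
import urllib.request

DOI = "10.1145/3672608.3707747"
# "mailto" is optional; Crossref asks polite clients to identify themselves with it.
url = f"https://api.crossref.org/works/{DOI}?mailto=you@example.org"

with urllib.request.urlopen(url, timeout=30) as response:
    record = json.load(response)

work = record["message"]  # same object as the "message" field in the record above
title = work["title"][0]
authors = ["{given} {family}".format(**a) for a in work.get("author", [])]
print(title)
print("; ".join(authors))
print("DOI:", work["DOI"], "| references:", work.get("reference-count"))

Under these assumptions the script would print the paper title, the four authors, and the reference count (18) taken from the same metadata reproduced above.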