{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T21:11:37Z","timestamp":1768943497180,"version":"3.49.0"},"reference-count":59,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100020595","name":"National Science and Technology Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100020595","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.neunet.2025.108274","type":"journal-article","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T00:26:15Z","timestamp":1761870375000},"page":"108274","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Expertfuse: A huffman tree-based gradual expert integration framework for MoE models"],"prefix":"10.1016","volume":"195","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3565-683X","authenticated-orcid":false,"given":"Yizeng","family":"Fang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5961-7863","authenticated-orcid":false,"given":"Juinndar","family":"Huang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2025.108274_bib0001","unstructured":"Abdin, M., Aneja, J., Awadalla, H., Awadallah, A., Awan, A. A., Bach, N., Bahree, A., Bakhtiari, A., Bao, J., Behl, H. et al. (2024). Phi-3 technical report: A highly capable language model locally on your phone. 
arXiv preprint arXiv: 2404.14219."},{"key":"10.1016\/j.neunet.2025.108274_bib0002","series-title":"Proceedings of the 2019 conference of the north american chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","article-title":"MathQA: Towards interpretable math word problem solving with operation-Based formalisms","author":"Amini","year":"2019"},{"key":"10.1016\/j.neunet.2025.108274_bib0003","series-title":"Proceedings of the 2020\u202fAAAI conference on artificial intelligence","first-page":"7432","article-title":"PIQA: Reasoning about physical commonsense in natural language","volume":"vol. 34","author":"Bisk","year":"2020"},{"key":"10.1016\/j.neunet.2025.108274_bib0004","doi-asserted-by":"crossref","first-page":"529","DOI":"10.1016\/j.neucom.2014.07.064","article-title":"Neighbourhood sampling in bagging for imbalanced data","volume":"150","author":"B\u0142aszczy\u0144ski","year":"2015","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neunet.2025.108274_bib0005","doi-asserted-by":"crossref","first-page":"123","DOI":"10.1023\/A:1018054314350","article-title":"Bagging predictors","volume":"24","author":"Breiman","year":"1996","journal-title":"Machine Learning"},{"key":"10.1016\/j.neunet.2025.108274_bib0006","doi-asserted-by":"crossref","DOI":"10.1109\/TKDE.2025.3554028","article-title":"A survey on mixture of experts in large language models","author":"Cai","year":"2025","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"10.1016\/j.neunet.2025.108274_bib0007","unstructured":"Chen, T., Huang, S., Xie, Y., Jiao, B., Jiang, D., Zhou, H., Li, J., & Wei, F. (2022). Task-specific expert pruning for sparse mixture-of-experts. arXiv preprint arXiv: 2206.00277. 10.48550\/arXiv.2206.00277."},{"key":"10.1016\/j.neunet.2025.108274_bib0008","series-title":"Proceedings of the 2022 international conference on machine learning","first-page":"4057","article-title":"Unified scaling laws for routed language models","author":"Clark","year":"2022"},{"key":"10.1016\/j.neunet.2025.108274_bib0009","series-title":"Proceedings of the 2019 conference of the north american chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","first-page":"2924","article-title":"Boolq: Exploring the surprising difficulty of natural yes\/no questions","author":"Clark","year":"2019"},{"key":"10.1016\/j.neunet.2025.108274_bib0010","unstructured":"Clark, P., Cowhey, I., Etzioni, O., Khot, T., Sabharwal, A., Schoenick, C., & Tafjord, O. (2018). Think you have solved question answering? try ARC, the AI2 reasoning challenge. arXiv preprint arXiv: 1803.05457. 10.48550\/arXiv.1803.05457."},{"key":"10.1016\/j.neunet.2025.108274_bib0011","unstructured":"Computer, T. (2023). RedPajama: an open source recipe to reproduce LLaMA training dataset. 
https:\/\/github.com\/togethercomputer\/RedPajama-Data."},{"key":"10.1016\/j.neunet.2025.108274_bib0012","series-title":"Proceedings of the 2005 machine learning challenges workshop","first-page":"177","article-title":"The PASCAL recognising textual entailment challenge","author":"Dagan","year":"2005"},{"key":"10.1016\/j.neunet.2025.108274_bib0013","series-title":"Proceedings of the 2024 annual meeting of the association for computational linguistics (volume 1: Long papers)","article-title":"Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models","author":"Dai","year":"2024"},{"key":"10.1016\/j.neunet.2025.108274_bib0014","series-title":"Proceedings of the 2005 international workshop on paraphrasing","article-title":"Automatically constructing a corpus of sentential paraphrases","author":"Dolan","year":"2005"},{"key":"10.1016\/j.neunet.2025.108274_bib0015","series-title":"Proceedings of the 2022 international conference on machine learning","first-page":"5547","article-title":"GLaM: Efficient scaling of language models with mixture-of-experts","author":"Du","year":"2022"},{"key":"10.1016\/j.neunet.2025.108274_bib0016","series-title":"Proceedings of the 2024 machine learning and systems","first-page":"224","article-title":"SiDA-MoE: Sparsity-inspired data-aware serving for efficient and scalable large mixture-of-experts models","volume":"vol. 6","author":"Du","year":"2024"},{"key":"10.1016\/j.neunet.2025.108274_bib0017","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2025.107426","article-title":"Multi-head ensemble of smoothed classifiers for certified robustness","volume":"188","author":"Fang","year":"2025","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2025.108274_bib0018","unstructured":"Fang, Z., Hong, Z., Huang, Y., Lyu, Y., Chen, W., Yu, Y., Yu, F., & Zheng, Z. (2025b). Fate: Fast edge inference of mixture-of-experts models via cross-layer gate. 
arXiv preprint arXiv: 2502.12224."},{"issue":"120","key":"10.1016\/j.neunet.2025.108274_bib0019","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.neunet.2025.108274_bib0020","series-title":"Proceedings of the 1996 international conference on machine learning","first-page":"148","article-title":"Experiments with a new boosting algorithm","author":"Freund","year":"1996"},{"key":"10.1016\/j.neunet.2025.108274_bib0021","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2025.107544","article-title":"Pruning the ensemble of convolutional neural networks using second-order cone programming","author":"G\u00fcldo\u011fu\u015f","year":"2025","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2025.108274_bib0022","article-title":"Towards efficient mixture of experts: A holistic study of compression techniques","author":"He","year":"2025","journal-title":"Transactions on Machine Learning Research"},{"key":"10.1016\/j.neunet.2025.108274_bib0023","series-title":"Proceedings of the 2021 international conference on learning representations","article-title":"Measuring massive multitask language understanding","author":"Hendrycks","year":"2021"},{"key":"10.1016\/j.neunet.2025.108274_bib0024","series-title":"Proceedings of the 2017 international conference on learning representations","article-title":"Snapshot ensembles: Train 1, get m for free","author":"Huang","year":"2017"},{"issue":"9","key":"10.1016\/j.neunet.2025.108274_bib0025","doi-asserted-by":"crossref","first-page":"1098","DOI":"10.1109\/JRPROC.1952.273898","article-title":"A method for the construction of minimum-redundancy codes","volume":"40","author":"Huffman","year":"1952","journal-title":"Proceedings of the IRE"},{"issue":"1","key":"10.1016\/j.neunet.2025.108274_bib0026","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","article-title":"Adaptive mixtures of local experts","volume":"3","author":"Jacobs","year":"1991","journal-title":"Neural Computation"},{"key":"10.1016\/j.neunet.2025.108274_bib0027","unstructured":"Jiang, A. Q., Sablayrolles, A., Roux, A., Mensch, A., Savary, B., Bamford, C., Chaplot, D. S., de las, C. D., Hanna, E. B., Bressand, F. et al. (2024). Mixtral of experts. arXiv preprint arXiv: 2401.04088. 
10.48550\/arXiv.2401.04088."},{"key":"10.1016\/j.neunet.2025.108274_bib0028","series-title":"Proceedings of the 2023 international conference on learning representations","article-title":"REPAIR: Renormalizing permuted activations for interpolation repair","author":"Jordan","year":"2023"},{"key":"10.1016\/j.neunet.2025.108274_bib0029","series-title":"Proceedings of the 2020\u202fACM SIGKDD international conference on knowledge discovery & data mining","first-page":"1316","article-title":"A novel deep learning model by stacking conditional restricted boltzmann machine and deep neural network","author":"Kang","year":"2020"},{"key":"10.1016\/j.neunet.2025.108274_bib0030","doi-asserted-by":"crossref","first-page":"109","DOI":"10.1016\/j.epsr.2015.03.027","article-title":"Improved short-term load forecasting using bagged neural networks","volume":"125","author":"Khwaja","year":"2015","journal-title":"Electric Power Systems Research"},{"key":"10.1016\/j.neunet.2025.108274_bib0031","article-title":"Simple and scalable predictive uncertainty estimation using deep ensembles","volume":"30","author":"Lakshminarayanan","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2025.108274_bib0032","series-title":"Proceedings of the 2021 international conference on learning representations","article-title":"GShard: Scaling giant models with conditional computation and automatic sharding","author":"Lepikhin","year":"2021"},{"key":"10.1016\/j.neunet.2025.108274_bib0033","series-title":"Proceedings of the 2023 international conference on learning representations","article-title":"Sparse mixture-of-experts are domain generalizable learners","author":"Li","year":"2023"},{"key":"10.1016\/j.neunet.2025.108274_bib0034","series-title":"Proceedings of the 2024 international conference on learning representations","article-title":"Merge, then compress: demystify efficient smoe with hints from its routing policy","author":"Li","year":"2024"},{"key":"10.1016\/j.neunet.2025.108274_bib0035","series-title":"Proceedings of the 2004 workshop on text summarization branches out","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.neunet.2025.108274_bib0036","unstructured":"Liu, A., Feng, B., Wang, B., Wang, B., Liu, B., Zhao, C., Dengr, C., Ruan, C., Dai, D., Guo, D. et al. (2024a). DeepSeek-V2: A strong, economical, and efficient mixture-of-experts language model. arXiv preprint arXiv: 2405.04434. 10.48550\/arXiv.2405.04434."},{"key":"10.1016\/j.neunet.2025.108274_bib0037","unstructured":"Liu, A., Feng, B., Xue, B., Wang, B., Wu, B., Lu, C., Zhao, C., Deng, C., Zhang, C., Ruan, C. et al. (2024b). DeepSeek-V3 technical report. arXiv preprint arXiv: 2412.19437. 
10.48550\/arXiv.2412.19437."},{"issue":"12","key":"10.1016\/j.neunet.2025.108274_bib0038","doi-asserted-by":"crossref","first-page":"5021","DOI":"10.1109\/TCYB.2019.2908387","article-title":"Stacking-Based deep neural network: Deep analytic network for pattern classification","volume":"50","author":"Low","year":"2019","journal-title":"IEEE Transactions on Cybernetics"},{"key":"10.1016\/j.neunet.2025.108274_bib0039","series-title":"Proceedings of the 2024 annual meeting of the association for computational linguistics (volume 1: Long papers)","article-title":"Not all experts are equal: Efficient expert pruning and skipping for mixture-of-experts large language models","author":"Lu","year":"2024"},{"key":"10.1016\/j.neunet.2025.108274_bib0040","series-title":"Proceedings of the 2025 conference on international conference on learning representations","article-title":"OLMoE: Open mixture-of-experts language models","author":"Muennighoff","year":"2025"},{"key":"10.1016\/j.neunet.2025.108274_bib0041","series-title":"Proceedings the 2016\u202fSIGNLL conference on computational natural language learning","article-title":"Abstractive text summarization using sequence-to-sequence RNNs and beyond","author":"Nallapati","year":"2016"},{"key":"10.1016\/j.neunet.2025.108274_bib0042","series-title":"Proceedings of the 2019 conference of the north american chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","first-page":"1267","article-title":"WiC: The word-in-Context dataset for evaluating context-sensitive meaning representations","author":"Pilehvar","year":"2019"},{"key":"10.1016\/j.neunet.2025.108274_bib0043","series-title":"Proceddings of the 2022 international conference on machine learning","first-page":"18332","article-title":"Deepspeed-Moe: Advancing mixture-of-experts inference and training to power next-generation AI scale","author":"Rajbhandari","year":"2022"},{"key":"10.1016\/j.neunet.2025.108274_bib0044","series-title":"Proceedings of the 2016 conference on empirical methods in natural language processing","first-page":"2383","article-title":"SQuAd: 100,000+ questions for machine comprehension of text","author":"Rajpurkar","year":"2016"},{"issue":"9","key":"10.1016\/j.neunet.2025.108274_bib0045","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1145\/3474381","article-title":"Winogrande: An adversarial winograd schema challenge at scale","volume":"64","author":"Sakaguchi","year":"2021","journal-title":"Communications of the ACM"},{"key":"10.1016\/j.neunet.2025.108274_bib0046","series-title":"Proceedings of the 2017 international conference on learning representations","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017"},{"key":"10.1016\/j.neunet.2025.108274_bib0047","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2025.107151","article-title":"Online ensemble model compression for nonstationary data stream learning","author":"Soares","year":"2025","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2025.108274_bib0048","series-title":"Proceedings of the 2013 conference on empirical methods in natural language processing","first-page":"1631","article-title":"Recursive deep models for semantic compositionality over a sentiment treebank","author":"Socher","year":"2013"},{"key":"10.1016\/j.neunet.2025.108274_bib0049","series-title":"Proceedings of the 2024 international conference on learning 
representations","article-title":"Zipit! merging models from different tasks without training","author":"Stoica","year":"2024"},{"key":"10.1016\/j.neunet.2025.108274_bib0050","unstructured":"Tang, P., Liu, J., Hou, X., Pu, Y., Wang, J., Heng, P.-A., Li, C., & Guo, M. (2024). HOBBIT: A mixed precision expert offloading system for fast moe inference. arXiv preprint arXiv: 2411.01433."},{"key":"10.1016\/j.neunet.2025.108274_bib0051","series-title":"Proceedings of the 2019\u202fIEEE winter conference on applications of computer vision (WACV)","first-page":"599","article-title":"Hibster: Hierarchical boosted deep metric learning for image retrieval","author":"Waltner","year":"2019"},{"key":"10.1016\/j.neunet.2025.108274_bib0052","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2025.107285","article-title":"A hierarchical mixture-Of-experts framework for few labeled node classification","volume":"188","author":"Wang","year":"2025","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2025.108274_bib0053","doi-asserted-by":"crossref","first-page":"625","DOI":"10.1162\/tacl_a_00290","article-title":"Neural network acceptability judgments","volume":"7","author":"Warstadt","year":"2019","journal-title":"Transactions of the Association for Computational Linguistics"},{"issue":"2","key":"10.1016\/j.neunet.2025.108274_bib0054","doi-asserted-by":"crossref","first-page":"241","DOI":"10.1016\/S0893-6080(05)80023-1","article-title":"Stacked generalization","volume":"5","author":"Wolpert","year":"1992","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2025.108274_bib0055","unstructured":"Yang, A., Li, A., Yang, B., Zhang, B., Hui, B., Zheng, B., Yu, B., Gao, C., Huang, C., Lv, C. et al. (2025). Qwen3 technical report. arXiv preprint arXiv: 2505.09388. 
10.48550\/arXiv.2505.09388."},{"key":"10.1016\/j.neunet.2025.108274_bib0056","doi-asserted-by":"crossref","DOI":"10.1109\/TMC.2025.3546466","article-title":"Edgemoe: Empowering sparse large language models on mobile devices","author":"Yi","year":"2025","journal-title":"IEEE Transactions on Mobile Computing"},{"key":"10.1016\/j.neunet.2025.108274_bib0057","series-title":"Proceedings of the 2109 annual meeting of the association for computational linguistics","article-title":"Hellaswag: Can a machine really finish your sentence?","author":"Zellers","year":"2019"},{"key":"10.1016\/j.neunet.2025.108274_bib0058","series-title":"First conference on language modeling","article-title":"Lory: Fully differentiable mixture-of-experts for autoregressive language model pre-training","author":"Zhong","year":"2024"},{"key":"10.1016\/j.neunet.2025.108274_bib0059","article-title":"Mixture-of-experts with expert choice routing","volume":"35","author":"Zhou","year":"2022","journal-title":"Advances in neural information processing systems"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608025011554?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608025011554?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T07:27:53Z","timestamp":1768894073000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608025011554"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":59,"alternative-id":["S0893608025011554"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2025.108274","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Expertfuse: A huffman tree-based gradual expert integration framework for MoE models","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2025.108274","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"108274"}}