{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:51:27Z","timestamp":1781538687755,"version":"3.54.5"},"reference-count":43,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100007224","name":"NAFOSTED","doi-asserted-by":"publisher","award":["102.05-2025.16"],"award-info":[{"award-number":["102.05-2025.16"]}],"id":[{"id":"10.13039\/100007224","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.knosys.2026.116001","type":"journal-article","created":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T06:17:29Z","timestamp":1776233849000},"page":"116001","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["MoL: Mixture of Layers in Cross-Tokenizer embedding model distillation"],"prefix":"10.1016","volume":"343","author":[{"given":"Hai An","family":"Vu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3834-4882","authenticated-orcid":false,"given":"Minh-Phuc","family":"Truong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tu","family":"Vu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0011-5137","authenticated-orcid":false,"given":"Linh Ngo","family":"Van","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.116001_b1","series-title":"Distilling the knowledge in a neural network","author":"Hinton","year":"2015"},{"key":"10.1016\/j.knosys.2026.116001_b2","series-title":"A survey of large language models","author":"Zhao","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b3","series-title":"DistilBERT, a distilled version of BERT: Smaller, faster, cheaper and lighter","author":"Sanh","year":"2020"},{"key":"10.1016\/j.knosys.2026.116001_b4","article-title":"TinyBERT: Distilling BERT for natural language understanding","author":"Jiao","year":"2020","journal-title":"EMNLP"},{"key":"10.1016\/j.knosys.2026.116001_b5","series-title":"MiniLM: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","author":"Wang","year":"2020"},{"key":"10.1016\/j.knosys.2026.116001_b6","series-title":"Distilling task-specific knowledge from BERT into simple neural networks","author":"Tang","year":"2019"},{"key":"10.1016\/j.knosys.2026.116001_b7","article-title":"Making monolingual sentence embeddings multilingual using knowledge distillation","author":"Reimers","year":"2020","journal-title":"EMNLP"},{"key":"10.1016\/j.knosys.2026.116001_b8","series-title":"MTEB: Massive text embedding benchmark","author":"Muennighoff","year":"2023"},{"key":"10.1016\/j.knosys.2026.116001_b9","series-title":"LLM2Vec: Compressing large language models into general-purpose text embeddings","author":"Li","year":"2023"},{"key":"10.1016\/j.knosys.2026.116001_b10","series-title":"Jasper and Stella: Distillation of SOTA embedding models","author":"Zhang","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b11","series-title":"DistilCSE: Effective knowledge distillation for contrastive sentence embeddings","author":"Gao","year":"2023"},{"key":"10.1016\/j.knosys.2026.116001_b12","series-title":"Patient knowledge distillation for BERT model compression","author":"Sun","year":"2019"},{"key":"10.1016\/j.knosys.2026.116001_b13","series-title":"MiniLLM: Knowledge distillation of large language models","author":"Gu","year":"2024"},{"key":"10.1016\/j.knosys.2026.116001_b14","series-title":"Towards cross-tokenizer distillation: the universal logit distillation loss for LLMs","author":"Boizard","year":"2025"},{"issue":"22","key":"10.1016\/j.knosys.2026.116001_b15","first-page":"23724","article-title":"Multi-level optimal transport for universal cross-tokenizer knowledge distillation on language models","volume":"39","author":"Cui","year":"2025","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.knosys.2026.116001_b16","series-title":"Knowledge fusion of large language models","author":"Wan","year":"2024"},{"key":"10.1016\/j.knosys.2026.116001_b17","series-title":"Dual-space knowledge distillation for large language models","author":"Zhang","year":"2024"},{"key":"10.1016\/j.knosys.2026.116001_b18","series-title":"Less is more: Task-aware layer-wise distillation for language model compression","author":"Liang","year":"2023"},{"key":"10.1016\/j.knosys.2026.116001_b19","first-page":"842","article-title":"A primer in BERTology: What we know about how BERT works","volume":"8","author":"Rogers","year":"2020","journal-title":"Trans. Comput. Linguist."},{"key":"10.1016\/j.knosys.2026.116001_b20","series-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","first-page":"3651","article-title":"What does BERT learn about the structure of language?","author":"Jawahar","year":"2019"},{"key":"10.1016\/j.knosys.2026.116001_b21","series-title":"Investigating layer importance in large language models","author":"Zhang","year":"2024"},{"key":"10.1016\/j.knosys.2026.116001_b22","series-title":"BERT-EMD: Many-to-many layer mapping for BERT compression with earth mover\u2019s distance","author":"Li","year":"2020"},{"key":"10.1016\/j.knosys.2026.116001_b23","series-title":"Why skip if you can combine: A simple knowledge distillation technique for intermediate layers","author":"Wu","year":"2020"},{"key":"10.1016\/j.knosys.2026.116001_b24","series-title":"Similarity of neural network representations revisited","author":"Kornblith","year":"2019"},{"key":"10.1016\/j.knosys.2026.116001_b25","series-title":"FitNets: Hints for thin deep nets","author":"Romero","year":"2015"},{"key":"10.1016\/j.knosys.2026.116001_b26","series-title":"Flexible feature distillation for large language models","author":"Saadi","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b27","series-title":"Sequence-level knowledge distillation","author":"Kim","year":"2016"},{"key":"10.1016\/j.knosys.2026.116001_b28","series-title":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","first-page":"7594","article-title":"EMO: Embedding model distillation via intra-model relation and optimal transport alignments","author":"Truong","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b29","series-title":"Enhancing cross-tokenizer knowledge distillation with contextual dynamical mapping","author":"Chen","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b30","series-title":"Universal cross-tokenizer distillation via approximate likelihood matching","author":"Minixhofer","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b31","series-title":"Knowledge distillation through geometry-aware representational alignment","author":"Bhattarai","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b32","series-title":"The Thirteenth International Conference on Learning Representations","article-title":"Improving language model distillation through hidden state matching","author":"Dasgupta","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b33","series-title":"Rethinking centered kernel alignment in knowledge distillation","author":"Zhou","year":"2024"},{"key":"10.1016\/j.knosys.2026.116001_b34","series-title":"Less is more: Task-aware layer-wise distillation for language model compression","author":"Liang","year":"2023"},{"issue":"12","key":"10.1016\/j.knosys.2026.116001_b35","doi-asserted-by":"crossref","first-page":"2639","DOI":"10.1162\/0899766042321814","article-title":"Canonical correlation analysis: An overview with application to learning methods","volume":"16","author":"Hardoon","year":"2004","journal-title":"Neural Comput."},{"key":"10.1016\/j.knosys.2026.116001_b36","series-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","first-page":"2204","article-title":"BIGPATENT: A large-scale dataset for abstractive and coherent summarization","author":"Sharma","year":"2019"},{"key":"10.1016\/j.knosys.2026.116001_b37","series-title":"AAAI","article-title":"SciTail: A textual entailment dataset from science question answering","author":"Khot","year":"2018"},{"issue":"15","key":"10.1016\/j.knosys.2026.116001_b38","first-page":"13388","article-title":"Natural language inference in context - investigating contextual reasoning over long texts","volume":"35","author":"Liu","year":"2021","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.knosys.2026.116001_b39","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","article-title":"Adversarial NLI: A new benchmark for natural language understanding","author":"Nie","year":"2020"},{"key":"10.1016\/j.knosys.2026.116001_b40","series-title":"Proceedings of the 8th International Workshop on Semantic Evaluation","first-page":"1","article-title":"SemEval-2014 task 1: Evaluation of compositional distributional semantic models on full sentences through semantic relatedness and textual entailment","author":"Marelli","year":"2014"},{"key":"10.1016\/j.knosys.2026.116001_b41","series-title":"BGE M3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation","author":"Chen","year":"2024"},{"key":"10.1016\/j.knosys.2026.116001_b42","series-title":"Qwen3 embedding: Advancing text embedding and reranking through foundation models","author":"Zhang","year":"2025"},{"key":"10.1016\/j.knosys.2026.116001_b43","series-title":"First Conference on Language Modeling","article-title":"LLM2Vec: Large language models are secretly powerful text encoders","author":"BehnamGhader","year":"2024"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126007276?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126007276?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T00:16:12Z","timestamp":1781223372000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126007276"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":43,"alternative-id":["S0950705126007276"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116001","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"MoL: Mixture of Layers in Cross-Tokenizer embedding model distillation","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116001","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116001"}}