{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:17:59Z","timestamp":1763191079398,"version":"3.45.0"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228585","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["TensorLLM: Tensorising Multi-Head Attention for Enhanced Reasoning and Compression in LLMs"],"prefix":"10.1109","author":[{"given":"Yuxuan","family":"Gu","sequence":"first","affiliation":[{"name":"Imperial College,Department of Electrical and Electronic Engineering,London,United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wuyang","family":"Zhou","sequence":"additional","affiliation":[{"name":"Imperial College,Department of Electrical and Electronic Engineering,London,United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Giorgos","family":"Iacovides","sequence":"additional","affiliation":[{"name":"Imperial College,Department of Electrical and Electronic Engineering,London,United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Danilo","family":"Mandic","sequence":"additional","affiliation":[{"name":"Imperial College,Department of Electrical and Electronic Engineering,London,United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"GPT-4 technical report","year":"2023","author":"Achiam","key":"ref2"},{"article-title":"LLaMA: Open and efficient foundation language models","year":"2023","author":"Touvron","key":"ref3"},{"article-title":"Llama 2: Open foundation and fine-tuned chat models","year":"2023","author":"Touvron","key":"ref4"},{"article-title":"The Llama 3 herd of models","year":"2024","author":"Dubey","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1176"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-2225"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1387"},{"article-title":"Distilling the Knowledge in a Neural Network","year":"2015","author":"Hinton","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2022.101429"},{"article-title":"Reducing Transformer Depth on Demand with Structured Dropout","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Fan","key":"ref11"},{"article-title":"The Truth is in There: Improving Reasoning in Language Models with Layer-Selective Rank Reduction","volume-title":"Proceedings of The Twelfth International Conference on Learning Representations","author":"Sharma","key":"ref12"},{"article-title":"Compressing Large Language Models using Low Rank and Low Precision Decomposition","volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems","author":"Saha","key":"ref13"},{"article-title":"Compression and Interpretability of Deep Neural Networks Via Tucker Tensor Layer: From First Principles to Tensor Valued Back-Propagation","year":"2019","author":"Calvi","key":"ref14"},{"article-title":"TensorGPT: Efficient compression of the embedding layer in LLMs based on the tensor-train decomposition","year":"2023","author":"Xu","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-96-8298-0_32"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4828"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-3007"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4808"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.aop.2014.06.013"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2013.2297439"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1137\/07070111X"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/BF02289464"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1137\/S0895479896305696"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-47969-4_30"},{"article-title":"Towards LLM-guided Efficient and Interpretable Multi-linear Tensor Network Rank Selection","year":"2024","author":"Iacovides","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02480"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"article-title":"Layer normalization","year":"2016","author":"Lei Ba","key":"ref29"},{"issue":"26","key":"ref30","first-page":"1","article-title":"TensorLy: Tensor Learning in Python","volume":"20","author":"Kossaifi","year":"2019","journal-title":"Journal of Machine Learning Research"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1137\/S0895479898346995"},{"key":"ref32","article-title":"RoBERTa: A robustly optimized BERT pretraining approach","volume":"364","author":"Liu","year":"2019"},{"article-title":"GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model","year":"2021","author":"Wang","key":"ref33"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1259"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1074"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287572"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1075"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228585.pdf?arnumber=11228585","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:13:45Z","timestamp":1763190825000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228585\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228585","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}