{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:23:06Z","timestamp":1763191386925,"version":"3.45.0"},"reference-count":44,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228828","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["DYNAMAX: Dynamic computing for Transformers and Mamba based architectures"],"prefix":"10.1109","author":[{"given":"Miguel","family":"Nogales","sequence":"first","affiliation":[{"name":"Universit&#x00E0; Della Svizzera Italiana,Lugano,Switzerland"}]},{"given":"Matteo","family":"Gambella","sequence":"additional","affiliation":[{"name":"Politecnico di Milano,Milano,Italy"}]},{"given":"Manuel","family":"Roveri","sequence":"additional","affiliation":[{"name":"Politecnico di Milano,Milano,Italy"}]}],"member":"263","reference":[{"year":"2023","author":"Yu","article-title":"Scaling Autoregressive MultiModal Models: Pretraining and Instruction Tuning","key":"ref1"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.18653\/v1\/2023.emnlp-main.482"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.48550\/ARXIV.1706.03762"},{"year":"2023","author":"Touvron","article-title":"LLaMA: Open and Efficient Foundation Language Models","key":"ref4"},{"year":"2023","author":"Jiang","article-title":"Mistral 7B","key":"ref5"},{"year":"2019","author":"Radford","article-title":"Language models are unsupervised multitask learners","key":"ref6"},{"volume-title":"tech. 
rep","year":"2023","article-title":"Claude 3 model card","key":"ref7"},{"year":"2024","author":"Snell","article-title":"Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters","key":"ref8"},{"year":"2024","article-title":"Learning to reason with LLMs","key":"ref9"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1038\/s41586-025-09422-z"},{"year":"2023","author":"Gu","article-title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","key":"ref11"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1145\/1718487.1718538"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1109\/ICPR.2016.7900006"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.1007\/978-3-030-01261-8_25"},{"key":"ref15","first-page":"9782","article-title":"Dynabert: Dynamic bert with adaptive width and depth","volume":"33","author":"Hou","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"year":"2024","author":"Gromov","article-title":"The Unreasonable Ineffectiveness of the Deeper Layers","key":"ref16"},{"year":"2018","author":"Devlin","article-title":"BERT: Pretraining of Deep Bidirectional Transformers for Language Understanding","key":"ref17"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1109\/cvpr52688.2022.01054"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1145\/3581783.3611762"},{"key":"ref20","first-page":"27387","article-title":"Nodeformer: A scalable graph structure learning transformer for node classification","volume":"35","author":"Wu","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.18653\/v1\/2021.findings-emnlp.43"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.18653\/v1\/2020.acl-main.537"},{"year":"2019","author":"Elbayad","article-title":"Depth-Adaptive Transformer","key":"ref23"},{"key":"ref24","first-page":"17456","article-title":"Confident adaptive language modeling","volume":"35","author":"Schuster","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.18653\/v1\/2023.emnlp-main.362"},{"year":"2023","author":"Chen","article-title":"EE-LLM: Large-Scale Training and Inference of Early-Exit Large Language Models with 3D Parallelism","key":"ref26"},{"issue":"140","key":"ref27","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of machine learning research"},{"year":"2024","author":"Han","article-title":"Parameter-Efficient Fine-Tuning for Large Models: A Comprehensive Survey","key":"ref28"},{"year":"2021","author":"Hu","article-title":"LoRA: Low-Rank Adaptation of Large Language Models","key":"ref29"},{"volume-title":"International Conference on Machine Learning (ICML)","author":"Dao","article-title":"Transformers are SSMs: Generalized models and efficient algorithms through structured state space duality","key":"ref30"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1109\/IJCNN54540.2023.10191876"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.7551\/mitpress\/7503.003.0024"},{"year":"2015","author":"Hinton","article-title":"Distilling the Knowledge in a Neural Network","key":"ref33"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.1109\/TPAMI.2024.3447085"},{"year":"2024","author":"Ma","article-title":"The Era of 1-bit LLMs: All Large 
Language Models are in 1.58 Bits","key":"ref35"},{"year":"2020","author":"Wang","article-title":"Linformer: Self-Attention with Linear Complexity","key":"ref36"},{"year":"2020","author":"Beltagy","article-title":"Longformer: The Long-Document Transformer","key":"ref37"},{"year":"2024","article-title":"Codestral mamba","key":"ref38"},{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"year":"2024","author":"Lozhkov","article-title":"Fineweb-edu","key":"ref40"},{"doi-asserted-by":"publisher","key":"ref41","DOI":"10.18653\/v1\/2022.acl-long.229"},{"doi-asserted-by":"publisher","key":"ref42","DOI":"10.1162\/tacl_a_00266"},{"doi-asserted-by":"publisher","key":"ref43","DOI":"10.18653\/v1\/P17-1147"},{"year":"2024","author":"Gao","article-title":"A framework for few-shot language model evaluation","key":"ref44"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228828.pdf?arnumber=11228828","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:19:12Z","timestamp":1763191152000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228828\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228828","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}
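The record above is a standard Crossref REST API work payload, served from the public endpoint https://api.crossref.org/works/{doi}. As a minimal sketch of how such a record can be fetched and post-processed, the snippet below retrieves the same work and prints its headline metadata. It assumes network access and the third-party `requests` package; the field paths (`message.title`, `message.author`, `message.reference`, ...) mirror the JSON shown above.

```python
# Minimal sketch: fetch and summarize a Crossref work record like the one above.
import requests

DOI = "10.1109/ijcnn64981.2025.11228828"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # payload sits under "message" when "status" == "ok"

title = work["title"][0]       # "title" is a list; the work title is its first element
authors = ", ".join(
    f'{a.get("given", "")} {a.get("family", "")}'.strip()
    for a in work.get("author", [])
)
print(title)
print(authors)
print("type:", work["type"], "| pages:", work.get("page"))
print("references deposited:", work.get("references-count"))

# Reference entries are heterogeneous: some carry a resolvable DOI, others
# only year/author/article-title strings, so treat every field as optional.
for ref in work.get("reference", [])[:5]:
    print(ref.get("DOI") or ref.get("article-title") or ref.get("key"))
```

Note that this mixed reference format is visible in the record itself: entries such as ref2 are DOI-only, while entries such as ref1 or ref7 carry only bibliographic strings, so any downstream consumer should handle both shapes.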