{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T00:09:20Z","timestamp":1776125360550,"version":"3.50.1"},"reference-count":44,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100000780","name":"European Commission","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000780","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1016\/j.neunet.2026.108800","type":"journal-article","created":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T20:45:24Z","timestamp":1772829924000},"page":"108800","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["EmbBERT: Attention under 2\u202fMB memory"],"prefix":"10.1016","volume":"200","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5453-1988","authenticated-orcid":false,"given":"Riccardo","family":"Bravin","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5964-5685","authenticated-orcid":false,"given":"Massimo","family":"Pavan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0182-3885","authenticated-orcid":false,"given":"Hazem","family":"Hesham Yousef Shalby","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1919-6141","authenticated-orcid":false,"given":"Fabrizio","family":"Pittorino","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7828-7687","authenticated-orcid":false,"given":"Manuel","family":"Roveri","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.108800_bib0001","unstructured":"Resource-efficient neural networks for embedded systems (2024). Resource-efficient neural networks for embedded systems. https:\/\/arxiv.org\/HTML\/2001.03048v3."},{"key":"10.1016\/j.neunet.2026.108800_bib0002","unstructured":"Akbulut, M. E., Shalby, H. H. Y., Pittorino, F., & Roveri, M. (2025). Infoq: Mixed-precision quantization via global information flow. https:\/\/arxiv.org\/abs\/2508.04753."},{"key":"10.1016\/j.neunet.2026.108800_bib0003","doi-asserted-by":"crossref","unstructured":"Cioflan, C., Cavigelli, L., Rusci, M., Prado, d., & Benini, L. (2024). On-device domain learning for keyword spotting on low-power extreme edge embedded systems. https:\/\/arxiv.org\/abs\/2403.10549.","DOI":"10.1109\/AICAS59952.2024.10595987"},{"key":"10.1016\/j.neunet.2026.108800_bib0004","unstructured":"Clark, K., Luong, M.-T., Le, Q. V., & Manning, C. D. (2020). Electra: Pre-training text encoders as discriminators rather than generators. 
http:\/\/arxiv.org\/abs\/2003.10555."},{"key":"10.1016\/j.neunet.2026.108800_bib0005","unstructured":"Devlin, J., Chang, M.-W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. https:\/\/arxiv.org\/abs\/1810.04805."},{"key":"10.1016\/j.neunet.2026.108800_bib0006","doi-asserted-by":"crossref","unstructured":"Fields, C., & Kennington, C. (2023). Exploring transformers as compact, data-efficient language models. https:\/\/aclanthology.org\/2023.conll-1.35\/. 10.18653\/v1\/2023.conll-1.35.","DOI":"10.18653\/v1\/2023.conll-1.35"},{"key":"10.1016\/j.neunet.2026.108800_bib0007","series-title":"Low-power computer vision","first-page":"291","article-title":"A survey of quantization methods for efficient neural network inference","author":"Gholami","year":"2022"},{"key":"10.1016\/j.neunet.2026.108800_bib0008","unstructured":"Gu, A., & Dao, T. (2023). Mamba: Linear-time sequence modeling with selective state spaces. 10.48550\/arXiv.2312.00752."},{"key":"10.1016\/j.neunet.2026.108800_bib0009","unstructured":"Hinton, G. (2015). Distilling the knowledge in a neural network. arXiv preprint arXiv: 1503.02531."},{"key":"10.1016\/j.neunet.2026.108800_bib0010","unstructured":"Hosseini, M., & Hosseini, P. (2024). You need to pay better attention: Rethinking the mathematics of attention mechanism. http:\/\/arxiv.org\/abs\/2403.01643."},{"key":"10.1016\/j.neunet.2026.108800_bib0011","unstructured":"Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2021). LoRA: Low-rank adaptation of large language models. arXiv: 2106.09685 [cs]. https:\/\/arxiv.org\/abs\/2106.09685."},{"key":"10.1016\/j.neunet.2026.108800_bib0012","unstructured":"HuggingFace (2024). Bitsandbytes. https:\/\/github.com\/bitsandbytes-foundation\/bitsandbytes\/tree\/main."},{"key":"10.1016\/j.neunet.2026.108800_bib0013","unstructured":"Jiang, Z., Yu, W., Zhou, D., Chen, Y., Feng, J., & Yan, S. (2020). ConvBERT: Improving BERT with span-based dynamic convolution. arXiv (Cornell University). 10.48550\/arxiv.2008.02496."},{"key":"10.1016\/j.neunet.2026.108800_bib0014","unstructured":"Joshi, M., Chen, D., Liu, Y., Weld, D. S., Zettlemoyer, L., & Levy, O. (2020). SpanBERT: Improving pre-training by representing and predicting spans. arXiv: 1907.10529 [cs]. https:\/\/arxiv.org\/abs\/1907.10529."},{"key":"10.1016\/j.neunet.2026.108800_bib0015","unstructured":"Kim, S., Gholaminejad, A., Yao, Z., Mahoney, M., & Keutzer, E. K. (2021). I-BERT: Integer-only BERT quantization. arXiv (Cornell University). 10.48550\/arxiv.2101.01321."},{"key":"10.1016\/j.neunet.2026.108800_bib0016","doi-asserted-by":"crossref","unstructured":"Kopetz, H., & Steiner, W. (2022). Internet of things. 10.1007\/978-3-031-11992-7_13.","DOI":"10.1007\/978-3-031-11992-7_13"},{"key":"10.1016\/j.neunet.2026.108800_bib0017","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2020). Albert: A lite bert for self-supervised learning of language representations. arXiv: 1909.11942 [cs]. https:\/\/arxiv.org\/abs\/1909.11942."},{"key":"10.1016\/j.neunet.2026.108800_bib0018","unstructured":"Li, Y., Chen, Y., Dai, X., Chen, D., Liu, M., Yuan, L., Liu, Z., Zhang, L., & Vasconcelos, N. (2020). Micronet: Towards image recognition with extremely low FLOPs. https:\/\/arxiv.org\/abs\/2011.12289."},{"key":"10.1016\/j.neunet.2026.108800_bib0019","unstructured":"Lin, J., Chen, W.-M., Lin, Y., Cohn, J., Gan, C., & Han, S. (2020). Mcunet: Tiny deep learning on iot devices. 
arXiv: 2007.10319 [cs]. https:\/\/arxiv.org\/abs\/2007.10319."},{"issue":"3","key":"10.1016\/j.neunet.2026.108800_bib0020","doi-asserted-by":"crossref","first-page":"8","DOI":"10.1109\/MCAS.2023.3302182","article-title":"Tiny machine learning: Progress and futures [feature]","volume":"23","author":"Lin","year":"2023","journal-title":"IEEE Circuits and Systems Magazine"},{"key":"10.1016\/j.neunet.2026.108800_bib0021","unstructured":"Liu, X., Zheng, Y., Du, Z., Ding, M., Qian, Y., Yang, Z., & Tang, J. (2021). Gpt understands, too. http:\/\/arxiv.org\/abs\/2103.10385."},{"key":"10.1016\/j.neunet.2026.108800_bib0022","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., & Stoyanov, V. (2019). RoBERTa: A robustly optimized BERT pretraining approach. https:\/\/arxiv.org\/abs\/1907.11692."},{"key":"10.1016\/j.neunet.2026.108800_bib0023","unstructured":"Lu, Y., Shen, M., Wang, H., Wang, X., van Rechem, C., & Wei, W. (2023). Machine learning for synthetic data generation: a review. arXiv preprint arXiv: 2302.04062."},{"key":"10.1016\/j.neunet.2026.108800_bib0024","series-title":"Proceedings of the 49th annual meeting of the association for computational linguistics: Human language technologies","first-page":"142","article-title":"Learning word vectors for sentiment analysis","author":"Maas","year":"2011"},{"key":"10.1016\/j.neunet.2026.108800_bib0025","series-title":"Proceedings of the 7th joint international conference on data science & management of data (11th ACM IKDD CODS and 29th COMAD)","first-page":"342","article-title":"NanoBERT: An extremely compact language model","author":"Maity","year":"2024"},{"key":"10.1016\/j.neunet.2026.108800_bib0026","series-title":"Findings of the association for computational linguistics: EMNLP 2020","first-page":"991","article-title":"Limit: The literal motion in text dataset","author":"Manotas","year":"2020"},{"key":"10.1016\/j.neunet.2026.108800_bib0027","unstructured":"Nagel, M., Fournarakis, M., Amjad, R. A., Bondarenko, Y., van Baalen, M., & Blankevoort, T. (2021). A White Paper on Neural Network Quantization. arXiv: 2106.08295 [cs]\">http:\/\/arxiv.org\/abs\/2106.08295."},{"key":"10.1016\/j.neunet.2026.108800_bib0028","doi-asserted-by":"crossref","DOI":"10.1016\/j.comnet.2021.108074","article-title":"A survey on wearable technology: History, state-of-the-art and current challenges","volume":"193","author":"Ometov","year":"2021","journal-title":"Computer Networks"},{"key":"10.1016\/j.neunet.2026.108800_bib0029","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1109\/MM.2023.3311826","article-title":"On-device customization of tiny deep learning models for keyword spotting with few examples","volume":"43","author":"Rusci","year":"2023","journal-title":"IEEE Micro"},{"key":"10.1016\/j.neunet.2026.108800_bib0030","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., & Chen, L.-C. (2018). Mobilenetv2: Inverted residuals and linear bottlenecks. https:\/\/arxiv.org\/abs\/1801.04381.","DOI":"10.1109\/CVPR.2018.00474"},{"key":"10.1016\/j.neunet.2026.108800_bib0031","unstructured":"Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. 
https:\/\/arxiv.org\/abs\/1910.01108."},{"key":"10.1016\/j.neunet.2026.108800_bib0032","series-title":"Proceedings of the 2018 conference on empirical methods in natural language processing","first-page":"3687","article-title":"CARER: Contextualized affect representations for emotion recognition","author":"Saravia","year":"2018"},{"key":"10.1016\/j.neunet.2026.108800_bib0033","unstructured":"Shalby, H. H. Y., Pittorino, F., Palermo, F., Trojaniello, D., & Roveri, M. (2025). Dqt: Dynamic quantization training via dequantization-free nested integer arithmetic. https:\/\/arxiv.org\/abs\/2508.09176."},{"key":"10.1016\/j.neunet.2026.108800_bib0034","unstructured":"Sun, Y., Li, Z., Zhang, Y., Pan, T., Dong, B., Guo, Y., & Wang, J. (2025). Efficient attention mechanisms for large language models: A survey. https:\/\/arxiv.org\/abs\/2507.19595."},{"key":"10.1016\/j.neunet.2026.108800_bib0035","doi-asserted-by":"crossref","unstructured":"Sun, Z., Yu, H., Song, X., Liu, R., Yang, Y., & Zhou, D. (2020). MobileBERT: a compact task-agnostic BERT for resource-limited devices. http:\/\/arxiv.org\/abs\/2004.02984.","DOI":"10.18653\/v1\/2020.acl-main.195"},{"key":"10.1016\/j.neunet.2026.108800_bib0036","unstructured":"Tan, M., Pang, R., & Le, Q. V. (2020). Efficientdet: Scalable and efficient object detection. arXiv: 1911.09070 [cs, eess]. https:\/\/arxiv.org\/abs\/1911.09070."},{"key":"10.1016\/j.neunet.2026.108800_bib0037","unstructured":"Turc, I., Chang, M.-W., Lee, K., & Toutanova, K. (2019). Well-read students learn better: On the importance of pre-training compact models. arXiv preprint arXiv: 1908.08962."},{"key":"10.1016\/j.neunet.2026.108800_bib0038","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108800_bib0039","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., & Bowman, S. R. (2019). Glue: A multi-task benchmark and analysis platform for natural language understanding. arXiv: 1804.07461 [cs]. https:\/\/arxiv.org\/abs\/1804.07461."},{"key":"10.1016\/j.neunet.2026.108800_bib0040","doi-asserted-by":"crossref","unstructured":"Wang, J., Fu, K., & Lu, C.-T. (2020). Sosnet: A graph convolutional network approach to fine-grained cyberbullying detection. https:\/\/people.cs.vt.edu\/ctlu\/Publication\/2020\/IEEE-BD-SOSNet-Wang.pdf.","DOI":"10.1109\/BigData50022.2020.9378065"},{"key":"10.1016\/j.neunet.2026.108800_bib0041","series-title":"Proceedings of the tenth international workshop on spoken dialogue systems technology (IWSDS)","article-title":"Benchmarking natural language understanding services for building conversational agents","author":"Xingkun Liu","year":"2019"},{"key":"10.1016\/j.neunet.2026.108800_bib0042","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V. (2019). Xlnet: Generalized autoregressive pretraining for language understanding. 
https:\/\/arxiv.org\/abs\/1906.08237."},{"key":"10.1016\/j.neunet.2026.108800_bib0043","series-title":"Nips","first-page":"0","article-title":"Character-level convolutional networks for text classification","author":"Zhang","year":"2015"},{"key":"10.1016\/j.neunet.2026.108800_bib0044","series-title":"The IEEE international conference on computer vision (ICCV)","article-title":"Aligning books and movies: Towards story-like visual explanations by watching movies and reading books","author":"Zhu","year":"2015"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026002625?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026002625?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T23:22:50Z","timestamp":1776122570000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026002625"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,8]]},"references-count":44,"alternative-id":["S0893608026002625"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108800","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"EmbBERT: Attention under 2\u202fMB memory","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108800","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Authors. Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"108800"}}