{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T00:34:40Z","timestamp":1767832480264,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,1,4]],"date-time":"2024-01-04T00:00:00Z","timestamp":1704326400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,1,4]]},"DOI":"10.1145\/3632410.3632451","type":"proceedings-article","created":{"date-parts":[[2024,1,3]],"date-time":"2024-01-03T18:15:16Z","timestamp":1704305716000},"page":"342-349","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["NanoBERT: An Extremely Compact Language Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9542-9250","authenticated-orcid":false,"given":"Krishanu","family":"Maity","sequence":"first","affiliation":[{"name":"Samsung R&amp;D Bangalore, India"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3466-4027","authenticated-orcid":false,"given":"Amit Tulsidas","family":"Chaulwar","sequence":"additional","affiliation":[{"name":"Samsung R&amp;D Bangalore, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1182-4529","authenticated-orcid":false,"given":"Vanraj","family":"Vala","sequence":"additional","affiliation":[{"name":"Samsung R&amp;D Bangalore, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0157-5091","authenticated-orcid":false,"given":"Ravi Sankar","family":"Guntur","sequence":"additional","affiliation":[{"name":"Samsung R&amp;D Bangalore, India"}]}],"member":"320","published-online":{"date-parts":[[2024,1,4]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Extreme compression of sentence-transformer ranker models: faster inference, longer battery life, and less storage on edge devices. ArXiv abs\/2207.12852","author":"Chaulwar Amit","year":"2022","unstructured":"Amit Chaulwar, Lukas Malik, Maciej Krajewski, Felix Reichel, Leif-Nissen Lundb\u00e6k, Michael Huth, and Bartlomiej Matejczyk. 2022. Extreme compression of sentence-transformer ranker models: faster inference, longer battery life, and less storage on edge devices. ArXiv abs\/2207.12852 (2022)."},{"key":"e_1_3_2_1_2_1","volume-title":"EELBERT: Tiny Models through Dynamic Embeddings. In EMNLP. https:\/\/arxiv.org\/abs\/2310.20144","author":"Cohn Gabrielle","year":"2023","unstructured":"Gabrielle Cohn, Rishika Agarwal, Deepanshu Gupta, and Siddharth Patwardhan. 2023. EELBERT: Tiny Models through Dynamic Embeddings. In EMNLP. https:\/\/arxiv.org\/abs\/2310.20144"},{"key":"e_1_3_2_1_3_1","volume-title":"Hate speech dataset from a white supremacy forum. arXiv preprint arXiv:1809.04444","author":"De\u00a0Gibert Ona","year":"2018","unstructured":"Ona De\u00a0Gibert, Naiara Perez, Aitor Garc\u00eda-Pablos, and Montse Cuadros. 2018. Hate speech dataset from a white supremacy forum. arXiv preprint arXiv:1809.04444 (2018)."},{"key":"e_1_3_2_1_4_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. 
arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_5_1","volume-title":"Kronecker Decomposition for GPT Compression. CoRR abs\/2110.08152","author":"Edalati Ali","year":"2021","unstructured":"Ali Edalati, Marzieh\u00a0S. Tahaei, Ahmad Rashid, Vahid\u00a0Partovi Nia, James\u00a0J. Clark, and Mehdi Rezagholizadeh. 2021. Kronecker Decomposition for GPT Compression. CoRR abs\/2110.08152 (2021). arXiv:2110.08152https:\/\/arxiv.org\/abs\/2110.08152"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Artem\u00a0M. Grachev D. Ignatov and A. Savchenko. 2017. Neural Networks Compression for Language Modeling. In Pattern Recognition and Machine Intelligence.","DOI":"10.1007\/978-3-319-69900-4_44"},{"key":"e_1_3_2_1_7_1","volume-title":"Learning word vectors for 157 languages. arXiv preprint arXiv:1802.06893","author":"Grave Edouard","year":"2018","unstructured":"Edouard Grave, Piotr Bojanowski, Prakhar Gupta, Armand Joulin, and Tomas Mikolov. 2018. Learning word vectors for 157 languages. arXiv preprint arXiv:1802.06893 (2018)."},{"key":"e_1_3_2_1_8_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu J","year":"2021","unstructured":"Edward\u00a0J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_9_1","first-page":"1","article-title":"Quantized Neural Networks: Training Neural Networks with Low Precision Weights and Activations","volume":"18","author":"Hubara Itay","year":"2017","unstructured":"Itay Hubara, Matthieu Courbariaux, Daniel Soudry, Ran El-Yaniv, and Yoshua Bengio. 2017. Quantized Neural Networks: Training Neural Networks with Low Precision Weights and Activations. J. Mach. Learn. Res. 18, 1 (jan 2017), 6869\u20136898.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_10_1","volume-title":"TinyBERT: Distilling BERT for Natural Language Understanding. CoRR abs\/1909.10351","author":"Jiao Xiaoqi","year":"2019","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2019. TinyBERT: Distilling BERT for Natural Language Understanding. CoRR abs\/1909.10351 (2019). arXiv:1909.10351http:\/\/arxiv.org\/abs\/1909.10351"},{"key":"e_1_3_2_1_11_1","volume-title":"Character-Aware Neural Language Models. CoRR abs\/1508.06615","author":"Kim Yoon","year":"2015","unstructured":"Yoon Kim, Yacine Jernite, David\u00a0A. Sontag, and Alexander\u00a0M. Rush. 2015. Character-Aware Neural Language Models. CoRR abs\/1508.06615 (2015). arXiv:1508.06615http:\/\/arxiv.org\/abs\/1508.06615"},{"key":"e_1_3_2_1_12_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)."},{"key":"e_1_3_2_1_13_1","volume-title":"LightRNN: Memory and Computation-Efficient Recurrent Neural Networks. CoRR abs\/1610.09893","author":"Li Xiang","year":"2016","unstructured":"Xiang Li, Tao Qin, Jian Yang, and Tie-Yan Liu. 2016. LightRNN: Memory and Computation-Efficient Recurrent Neural Networks. CoRR abs\/1610.09893 (2016). 
arXiv:1610.09893http:\/\/arxiv.org\/abs\/1610.09893"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.3390\/computers12030060"},{"key":"e_1_3_2_1_15_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i17.17745"},{"key":"e_1_3_2_1_17_1","volume-title":"Distributed representations of words and phrases and their compositionality. Advances in neural information processing systems 26","author":"Mikolov Tomas","year":"2013","unstructured":"Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg\u00a0S Corrado, and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. Advances in neural information processing systems 26 (2013)."},{"key":"e_1_3_2_1_18_1","volume-title":"Improved Knowledge Distillation via Teacher Assistant: Bridging the Gap Between Student and Teacher. CoRR abs\/1902.03393","author":"Mirzadeh Seyed-Iman","year":"2019","unstructured":"Seyed-Iman Mirzadeh, Mehrdad Farajtabar, Ang Li, and Hassan Ghasemzadeh. 2019. Improved Knowledge Distillation via Teacher Assistant: Bridging the Gap Between Student and Teacher. CoRR abs\/1902.03393 (2019). arXiv:1902.03393http:\/\/arxiv.org\/abs\/1902.03393"},{"key":"e_1_3_2_1_19_1","volume-title":"Exploring Sparsity in Recurrent Neural Networks. CoRR abs\/1704.05119","author":"Narang Sharan","year":"2017","unstructured":"Sharan Narang, Gregory\u00a0F. Diamos, Shubho Sengupta, and Erich Elsen. 2017. Exploring Sparsity in Recurrent Neural Networks. CoRR abs\/1704.05119 (2017). arXiv:1704.05119http:\/\/arxiv.org\/abs\/1704.05119"},{"key":"e_1_3_2_1_20_1","volume-title":"Recurrent Neural Networks With Limited Numerical Precision. CoRR abs\/1608.06902","author":"Ott Joachim","year":"2016","unstructured":"Joachim Ott, Zhouhan Lin, Ying Zhang, Shih-Chii Liu, and Yoshua Bengio. 2016. Recurrent Neural Networks With Limited Numerical Precision. CoRR abs\/1608.06902 (2016). arXiv:1608.06902http:\/\/arxiv.org\/abs\/1608.06902"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2022.03.023"},{"key":"e_1_3_2_1_22_1","volume-title":"Deep contextualized word representations. CoRR abs\/1802.05365","author":"Peters E.","year":"2018","unstructured":"Matthew\u00a0E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, and Luke Zettlemoyer. 2018. Deep contextualized word representations. CoRR abs\/1802.05365 (2018). arXiv:1802.05365http:\/\/arxiv.org\/abs\/1802.05365"},{"key":"e_1_3_2_1_23_1","volume-title":"Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation. CoRR abs\/2004.09813","author":"Reimers Nils","year":"2020","unstructured":"Nils Reimers and Iryna Gurevych. 2020. Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation. CoRR abs\/2004.09813 (2020). arXiv:2004.09813https:\/\/arxiv.org\/abs\/2004.09813"},{"key":"e_1_3_2_1_24_1","volume-title":"Carlo Gatta, and Yoshua Bengio","author":"Romero Adriana","year":"2014","unstructured":"Adriana Romero, Nicolas Ballas, Samira\u00a0Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. 2014. FitNets: Hints for Thin Deep Nets. 
CoRR abs\/1412.6550 (2014)."},{"key":"e_1_3_2_1_25_1","volume-title":"Q-BERT: Hessian Based Ultra Low Precision Quantization of BERT. CoRR abs\/1909.05840","author":"Shen Sheng","year":"2019","unstructured":"Sheng Shen, Zhen Dong, Jiayu Ye, Linjian Ma, Zhewei Yao, Amir Gholami, Michael\u00a0W. Mahoney, and Kurt Keutzer. 2019. Q-BERT: Hessian Based Ultra Low Precision Quantization of BERT. CoRR abs\/1909.05840 (2019). arXiv:1909.05840http:\/\/arxiv.org\/abs\/1909.05840"},{"key":"e_1_3_2_1_26_1","volume-title":"Data-free parameter pruning for Deep Neural Networks. CoRR abs\/1507.06149","author":"Srinivas Suraj","year":"2015","unstructured":"Suraj Srinivas and R.\u00a0Venkatesh Babu. 2015. Data-free parameter pruning for Deep Neural Networks. CoRR abs\/1507.06149 (2015). arXiv:1507.06149http:\/\/arxiv.org\/abs\/1507.06149"},{"key":"e_1_3_2_1_27_1","volume-title":"Patient Knowledge Distillation for BERT Model Compression. CoRR abs\/1908.09355","author":"Sun Siqi","year":"2019","unstructured":"Siqi Sun, Yu Cheng, Zhe Gan, and Jingjing Liu. 2019. Patient Knowledge Distillation for BERT Model Compression. CoRR abs\/1908.09355 (2019). arXiv:1908.09355http:\/\/arxiv.org\/abs\/1908.09355"},{"key":"e_1_3_2_1_28_1","volume-title":"MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices. CoRR abs\/2004.02984","author":"Sun Zhiqing","year":"2020","unstructured":"Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. 2020. MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices. CoRR abs\/2004.02984 (2020). arXiv:2004.02984https:\/\/arxiv.org\/abs\/2004.02984"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/3060832.3060907"},{"key":"e_1_3_2_1_30_1","first-page":"03","volume-title":"Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL","author":"F.","year":"2003","unstructured":"Erik\u00a0F. Tjong Kim\u00a0Sang and Fien De\u00a0Meulder. 2003. Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition. In Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL 2003. 142\u2013147. https:\/\/aclanthology.org\/W03-0419"},{"key":"e_1_3_2_1_31_1","volume-title":"Well-read students learn better: The impact of student initialization on knowledge distillation. arXiv preprint arXiv:1908.08962 13","author":"Turc Iulia","year":"2019","unstructured":"Iulia Turc, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Well-read students learn better: The impact of student initialization on knowledge distillation. arXiv preprint arXiv:1908.08962 13 (2019)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1580"},{"key":"e_1_3_2_1_33_1","volume-title":"KDD","author":"Wang Haoyu","year":"2023","unstructured":"Haoyu Wang, Ruirui Li, Haoming Jiang, Zhengyang Wang, Xianfeng Tang, Bin Bi, Monica Cheng, Bing Yin, Yaqing Wang, Tuo Zhao, and Jing Gao. 2023. LightToken: A task and model-agnostic lightweight token embedding framework for pre-trained language models. In KDD 2023. https:\/\/www.amazon.science\/publications\/lighttoken-a-task-and-model-agnostic-lightweight-token-embedding-framework-for-pre-trained-language-models"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData50022.2020.9378065"},{"key":"e_1_3_2_1_35_1","volume-title":"Advances in Neural Information Processing Systems, S.\u00a0Bengio, H.\u00a0Wallach, H.\u00a0Larochelle, K.\u00a0Grauman, N.\u00a0Cesa-Bianchi, and R.\u00a0Garnett (Eds.). 
Vol.\u00a031. Curran Associates","author":"Wang Peiqi","year":"2018","unstructured":"Peiqi Wang, Xinfeng Xie, Lei Deng, Guoqi Li, Dongsheng Wang, and Yuan Xie. 2018. HitNet: Hybrid Ternary Recurrent Neural Network. In Advances in Neural Information Processing Systems, S.\u00a0Bengio, H.\u00a0Wallach, H.\u00a0Larochelle, K.\u00a0Grauman, N.\u00a0Cesa-Bianchi, and R.\u00a0Garnett (Eds.). Vol.\u00a031. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2018\/file\/82cec96096d4281b7c95cd7e74623496-Paper.pdf"},{"key":"e_1_3_2_1_36_1","volume-title":"Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems 32","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Russ\u00a0R Salakhutdinov, and Quoc\u00a0V Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems 32 (2019)."}],"event":{"name":"CODS-COMAD 2024: 7th Joint International Conference on Data Science & Management of Data (11th ACM IKDD CODS and 29th COMAD)","location":"Bangalore India","acronym":"CODS-COMAD 2024"},"container-title":["Proceedings of the 7th Joint International Conference on Data Science &amp; Management of Data (11th ACM IKDD CODS and 29th COMAD)"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3632410.3632451","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3632410.3632451","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:36:29Z","timestamp":1755869789000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3632410.3632451"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1,4]]},"references-count":36,"alternative-id":["10.1145\/3632410.3632451","10.1145\/3632410"],"URL":"https:\/\/doi.org\/10.1145\/3632410.3632451","relation":{},"subject":[],"published":{"date-parts":[[2024,1,4]]},"assertion":[{"value":"2024-01-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
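The record above is the standard envelope returned by Crossref's REST API for a single work (`GET https://api.crossref.org/works/{doi}`). As a minimal sketch of how such a record can be retrieved and a few fields pulled out, the snippet below uses the `requests` package; the `mailto` address is a hypothetical placeholder for Crossref's "polite pool" convention, and fields like `author` may be absent on other records, so they are accessed defensively.

```python
# Sketch: fetch the Crossref work record shown above and extract a few fields.
import requests

DOI = "10.1145/3632410.3632451"  # the NanoBERT proceedings article

resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.com"},  # hypothetical contact address for the polite pool
    timeout=30,
)
resp.raise_for_status()

# The payload matches the envelope above: {"status": "ok", ..., "message": {...}}
work = resp.json()["message"]

title = work["title"][0]  # "NanoBERT: An Extremely Compact Language Model"
authors = [f'{a.get("given", "")} {a.get("family", "")}'.strip()
           for a in work.get("author", [])]

print(title)
print(", ".join(authors))
print(f'{work["references-count"]} references, '
      f'cited {work["is-referenced-by-count"]} times')
```

Note that `reference-count` and `references-count` are duplicate fields in Crossref's schema (the latter is the current name), and the `unstructured` strings in the `reference` array are free-text citations as deposited by the publisher, so their formatting varies from entry to entry.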