{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T06:55:36Z","timestamp":1775890536426,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,14]],"date-time":"2022-08-14T00:00:00Z","timestamp":1660435200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,8,14]]},"DOI":"10.1145\/3534678.3539260","type":"proceedings-article","created":{"date-parts":[[2022,8,12]],"date-time":"2022-08-12T19:06:12Z","timestamp":1660331172000},"page":"784-794","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":82,"title":["Learned Token Pruning for Transformers"],"prefix":"10.1145","author":[{"given":"Sehoon","family":"Kim","sequence":"first","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}]},{"given":"Sheng","family":"Shen","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}]},{"given":"David","family":"Thorsley","sequence":"additional","affiliation":[{"name":"Samsung Semiconductor, Inc., San Jose, CA, USA"}]},{"given":"Amir","family":"Gholami","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}]},{"given":"Woosuk","family":"Kwon","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}]},{"given":"Joseph","family":"Hassoun","sequence":"additional","affiliation":[{"name":"Samsung Semiconductor, Inc., Berkeley, CA, USA"}]},{"given":"Kurt","family":"Keutzer","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2022,8,14]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"BinaryBERT: Pushing the Limit of BERT Quantization. arXiv preprint arXiv:2012.15701","author":"Bai Haoli","year":"2020","unstructured":"Haoli Bai,Wei Zhang, Lu Hou, Lifeng Shang, Jing Jin, Xin Jiang, Qun Liu, Michael Lyu, and Irwin King. 2020. BinaryBERT: Pushing the Limit of BERT Quantization. arXiv preprint arXiv:2012.15701 (2020)."},{"key":"e_1_3_2_2_2_1","volume-title":"Efficient 8-bit quantization of transformer neural machine language translation model. arXiv preprint arXiv:1906.00532","author":"Bhandare Aishwarya","year":"2019","unstructured":"Aishwarya Bhandare, Vamsi Sripathi, Deepthi Karkada, Vivek Menon, Sun Choi, Kushal Datta, and Vikram Saletore. 2019. Efficient 8-bit quantization of transformer neural machine language translation model. arXiv preprint arXiv:1906.00532 (2019)."},{"key":"e_1_3_2_2_3_1","volume-title":"Semeval-2017 task 1: Semantic textual similarity-multilingual and cross-lingual focused evaluation. arXiv preprint arXiv:1708.00055","author":"Cer Daniel","year":"2017","unstructured":"Daniel Cer, Mona Diab, Eneko Agirre, Inigo Lopez-Gazpio, and Lucia Specia. 2017. Semeval-2017 task 1: Semantic textual similarity-multilingual and cross-lingual focused evaluation. arXiv preprint arXiv:1708.00055 (2017)."},{"key":"e_1_3_2_2_4_1","volume-title":"The lottery ticket hypothesis for pre-trained BERT networks. 
arXiv preprint arXiv:2007.12223","author":"Chen Tianlong","year":"2020","unstructured":"Tianlong Chen, Jonathan Frankle, Shiyu Chang, Sijia Liu, Yang Zhang, Zhangyang Wang, and Michael Carbin. 2020. The lottery ticket hypothesis for pre-trained BERT networks. arXiv preprint arXiv:2007.12223 (2020)."},{"key":"e_1_3_2_2_5_1","volume-title":"Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509","author":"Child Rewon","year":"2019","unstructured":"Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. 2019. Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509 (2019)."},{"key":"e_1_3_2_2_6_1","volume-title":"Machine Learning Challenges Workshop. Springer, 177--190","author":"Dagan Ido","year":"2005","unstructured":"Ido Dagan, Oren Glickman, and Bernardo Magnini. 2005. The PASCAL recognising textual entailment challenge. In Machine Learning Challenges Workshop. Springer, 177--190."},{"key":"e_1_3_2_2_7_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the Third International Workshop on Paraphrasing (IWP2005)","author":"Dolan William B","year":"2005","unstructured":"William B Dolan and Chris Brockett. 2005. Automatically constructing a corpus of sentential paraphrases. In Proceedings of the Third International Workshop on Paraphrasing (IWP2005)."},{"key":"e_1_3_2_2_9_1","volume-title":"Reducing transformer depth on demand with structured dropout. arXiv preprint arXiv:1909.11556","author":"Fan Angela","year":"2019","unstructured":"Angela Fan, Edouard Grave, and Armand Joulin. 2019. Reducing transformer depth on demand with structured dropout. arXiv preprint arXiv:1909.11556 (2019)."},{"key":"e_1_3_2_2_10_1","volume-title":"Training with quantization noise for extreme model compression. arXiv e-prints","author":"Fan Angela","year":"2020","unstructured":"Angela Fan, Pierre Stock, Benjamin Graham, Edouard Grave, R\u00e9mi Gribonval, Herv\u00e9 J\u00e9gou, and Armand Joulin. 2020. Training with quantization noise for extreme model compression. arXiv e-prints (2020), arXiv--2004."},{"key":"e_1_3_2_2_11_1","volume-title":"The lottery ticket hypothesis: Finding sparse, trainable neural networks. arXiv preprint arXiv:1803.03635","author":"Frankle Jonathan","year":"2018","unstructured":"Jonathan Frankle and Michael Carbin. 2018. The lottery ticket hypothesis: Finding sparse, trainable neural networks. arXiv preprint arXiv:1803.03635 (2018)."},{"key":"e_1_3_2_2_12_1","volume-title":"A survey of quantization methods for efficient neural network inference. arXiv preprint arXiv:2103.13630","author":"Gholami Amir","year":"2021","unstructured":"Amir Gholami, Sehoon Kim, Zhen Dong, Zhewei Yao, Michael W Mahoney, and Kurt Keutzer. 2021. A survey of quantization methods for efficient neural network inference. arXiv preprint arXiv:2103.13630 (2021)."},{"key":"e_1_3_2_2_13_1","volume-title":"International Conference on Machine Learning. PMLR, 3690--3699","author":"Goyal Saurabh","year":"2020","unstructured":"Saurabh Goyal, Anamitra Roy Choudhury, Saurabh Raje, Venkatesan Chakaravarthy, Yogish Sabharwal, and Ashish Verma. 2020. 
Power-bert: Accelerating bert inference via progressive word-vector elimination. In International Conference on Machine Learning. PMLR, 3690--3699."},{"key":"e_1_3_2_2_14_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_2_15_1","volume-title":"Dynabert: Dynamic bert with adaptive width and depth. arXiv preprint arXiv:2004.04037","author":"Hou Lu","year":"2020","unstructured":"Lu Hou, Zhiqi Huang, Lifeng Shang, Xin Jiang, Xiao Chen, and Qun Liu. 2020. Dynabert: Dynamic bert with adaptive width and depth. arXiv preprint arXiv:2004.04037 (2020)."},{"key":"e_1_3_2_2_16_1","volume-title":"SqueezeBERT: What can computer vision teach NLP about efficient neural networks? arXiv preprint arXiv:2006.11316","author":"Iandola Forrest N","year":"2020","unstructured":"Forrest N Iandola, Albert E Shaw, Ravi Krishna, and Kurt W Keutzer. 2020. SqueezeBERT: What can computer vision teach NLP about efficient neural networks? arXiv preprint arXiv:2006.11316 (2020)."},{"key":"e_1_3_2_2_17_1","volume-title":"First Quora Dataset Release: Question Pairs.(2017). URL https:\/\/data. quora. com\/First-Quora-Dataset-Release-Question-Pairs","author":"Iyer Shankar","year":"2017","unstructured":"Shankar Iyer, Nikhil Dandekar, and Kornl Csernai. 2017. First Quora Dataset Release: Question Pairs.(2017). URL https:\/\/data. quora. com\/First-Quora-Dataset-Release-Question-Pairs (2017)."},{"key":"e_1_3_2_2_18_1","volume-title":"Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351","author":"Jiao Xiaoqi","year":"2019","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, FangWang, and Qun Liu. 2019. Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351 (2019)."},{"key":"e_1_3_2_2_19_1","volume-title":"International Conference on Machine Learning. PMLR, 5156--5165","author":"Katharopoulos Angelos","year":"2020","unstructured":"Angelos Katharopoulos, Apoorv Vyas, Nikolaos Pappas, and Fran\u00e7ois Fleuret. 2020. Transformers are rnns: Fast autoregressive transformers with linear attention. In International Conference on Machine Learning. PMLR, 5156--5165."},{"key":"e_1_3_2_2_20_1","volume-title":"schubert: Optimizing elements of bert. arXiv preprint arXiv:2005.06628","author":"Khetan Ashish","year":"2020","unstructured":"Ashish Khetan and Zohar Karnin. 2020. schubert: Optimizing elements of bert. arXiv preprint arXiv:2005.06628 (2020)."},{"key":"e_1_3_2_2_21_1","volume-title":"Use Anytime with Search. arXiv preprint arXiv:2010.07003","author":"Kim Gyuwan","year":"2020","unstructured":"Gyuwan Kim and Kyunghyun Cho. 2020. Length-Adaptive Transformer: Train Once with Length Drop, Use Anytime with Search. arXiv preprint arXiv:2010.07003 (2020)."},{"key":"e_1_3_2_2_22_1","volume-title":"I-BERT: Integer-only BERT Quantization. International conference on machine learning","author":"Kim Sehoon","year":"2021","unstructured":"Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W Mahoney, and Kurt Keutzer. 2021. I-BERT: Integer-only BERT Quantization. International conference on machine learning (2021)."},{"key":"e_1_3_2_2_23_1","volume-title":"Reformer: The efficient transformer. 
arXiv preprint arXiv:2001.04451","author":"Kitaev Nikita","year":"2020","unstructured":"Nikita Kitaev, Lukasz Kaiser, and Anselm Levskaya. 2020. Reformer: The efficient transformer. arXiv preprint arXiv:2001.04451 (2020)."},{"key":"e_1_3_2_2_24_1","volume-title":"Block pruning for faster transformers. arXiv preprint arXiv:2109.04838","author":"Lagunas Fran\u00e7ois","year":"2021","unstructured":"Fran\u00e7ois Lagunas, Ella Charlaix, Victor Sanh, and Alexander M Rush. 2021. Block pruning for faster transformers. arXiv preprint arXiv:2109.04838 (2021)."},{"key":"e_1_3_2_2_25_1","volume-title":"Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)."},{"key":"e_1_3_2_2_26_1","volume-title":"Efficient transformer-based large scale language representations using hardware-friendly block structured pruning. arXiv preprint arXiv:2009.08065","author":"Li Bingbing","year":"2020","unstructured":"Bingbing Li, Zhenglun Kong, Tianyun Zhang, Ji Li, Zhengang Li, Hang Liu, and Caiwen Ding. 2020. Efficient transformer-based large scale language representations using hardware-friendly block structured pruning. arXiv preprint arXiv:2009.08065 (2020)."},{"key":"e_1_3_2_2_27_1","volume-title":"Zi Yang, Nan Hua, and Dan Roth.","author":"Lin Zi","year":"2020","unstructured":"Zi Lin, Jeremiah Zhe Liu, Zi Yang, Nan Hua, and Dan Roth. 2020. Pruning Redundant Mappings in Transformer Models via Spectral-Normalized Identity Prior. arXiv preprint arXiv:2010.01791 (2020)."},{"key":"e_1_3_2_2_28_1","volume-title":"RoBERTa: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.425"},{"key":"e_1_3_2_2_30_1","volume-title":"Are sixteen heads really better than one? arXiv preprint arXiv:1905.10650","author":"Michel Paul","year":"2019","unstructured":"Paul Michel, Omer Levy, and Graham Neubig. 2019. Are sixteen heads really better than one? arXiv preprint arXiv:1905.10650 (2019)."},{"key":"e_1_3_2_2_31_1","volume-title":"When BERT plays the lottery, all tickets are winning. arXiv preprint arXiv:2005.00561","author":"Prasanna Sai","year":"2020","unstructured":"Sai Prasanna, Anna Rogers, and Anna Rumshisky. 2020. When BERT plays the lottery, all tickets are winning. arXiv preprint arXiv:2005.00561 (2020)."},{"key":"e_1_3_2_2_32_1","volume-title":"Train Short","author":"Press Ofir","year":"2021","unstructured":"Ofir Press, Noah A Smith, and Mike Lewis. 2021. Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation. arXiv preprint arXiv:2108.12409 (2021)."},{"key":"e_1_3_2_2_33_1","volume-title":"Know what you don't know: Unanswerable questions for SQuAD. arXiv preprint arXiv:1806.03822","author":"Rajpurkar Pranav","year":"2018","unstructured":"Pranav Rajpurkar, Robin Jia, and Percy Liang. 2018. Know what you don't know: Unanswerable questions for SQuAD. 
arXiv preprint arXiv:1806.03822 (2018)."},{"key":"e_1_3_2_2_34_1","volume-title":"100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250","author":"Rajpurkar Pranav","year":"2016","unstructured":"Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. SQuAD: 100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250 (2016)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00353"},{"key":"e_1_3_2_2_36_1","volume-title":"On the Effect of Dropping Layers of Pre-trained Transformer Models. arXiv preprint arXiv:2004.03844","author":"Sajjad Hassan","year":"2020","unstructured":"Hassan Sajjad, Fahim Dalvi, Nadir Durrani, and Preslav Nakov. 2020. On the Effect of Dropping Layers of Pre-trained Transformer Models. arXiv preprint arXiv:2004.03844 (2020)."},{"key":"e_1_3_2_2_37_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_2_38_1","volume-title":"Movement pruning: Adaptive sparsity by fine-tuning. arXiv preprint arXiv:2005.07683","author":"Sanh Victor","year":"2020","unstructured":"Victor Sanh, Thomas Wolf, and Alexander M Rush. 2020. Movement pruning: Adaptive sparsity by fine-tuning. arXiv preprint arXiv:2005.07683 (2020)."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Sheng Shen Zhen Dong Jiayu Ye Linjian Ma Zhewei Yao Amir Gholami Michael W Mahoney and Kurt Keutzer. 2020. Q-BERT: Hessian Based Ultra Low Precision Quantization of BERT.. In AAAI. 8815--8821.","DOI":"10.1609\/aaai.v34i05.6409"},{"key":"e_1_3_2_2_40_1","volume-title":"Proceedings of the 2013 conference on empirical methods in natural language processing. 1631--1642","author":"Socher Richard","year":"2013","unstructured":"Richard Socher, Alex Perelygin, JeanWu, Jason Chuang, Christopher D Manning, Andrew Y Ng, and Christopher Potts. 2013. Recursive deep models for semantic compositionality over a sentiment treebank. In Proceedings of the 2013 conference on empirical methods in natural language processing. 1631--1642."},{"key":"e_1_3_2_2_41_1","volume-title":"Patient knowledge distillation for bert model compression. arXiv preprint arXiv:1908.09355","author":"Sun Siqi","year":"2019","unstructured":"Siqi Sun, Yu Cheng, Zhe Gan, and Jingjing Liu. 2019. Patient knowledge distillation for bert model compression. arXiv preprint arXiv:1908.09355 (2019)."},{"key":"e_1_3_2_2_42_1","volume-title":"Mobilebert: a compact task-agnostic bert for resource-limited devices. arXiv preprint arXiv:2004.02984","author":"Sun Zhiqing","year":"2020","unstructured":"Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. 2020. Mobilebert: a compact task-agnostic bert for resource-limited devices. arXiv preprint arXiv:2004.02984 (2020)."},{"key":"e_1_3_2_2_43_1","volume-title":"Distilling task-specific knowledge from BERT into simple neural networks. arXiv preprint arXiv:1903.12136","author":"Tang Raphael","year":"2019","unstructured":"Raphael Tang, Yao Lu, Linqing Liu, Lili Mou, Olga Vechtomova, and Jimmy Lin. 2019. Distilling task-specific knowledge from BERT into simple neural networks. 
arXiv preprint arXiv:1903.12136 (2019)."},{"key":"e_1_3_2_2_44_1","volume-title":"International Conference on Machine Learning. PMLR, 9438--9447","author":"Tay Yi","year":"2020","unstructured":"Yi Tay, Dara Bahri, Liu Yang, Donald Metzler, and Da-Cheng Juan. 2020. Sparse sinkhorn attention. In International Conference on Machine Learning. PMLR, 9438--9447."},{"key":"e_1_3_2_2_45_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998--6008."},{"key":"e_1_3_2_2_46_1","volume-title":"Analyzing multi-head self-attention: Specialized heads do the heavy lifting, the rest can be pruned. arXiv preprint arXiv:1905.09418","author":"Voita Elena","year":"2019","unstructured":"Elena Voita, David Talbot, Fedor Moiseev, Rico Sennrich, and Ivan Titov. 2019. Analyzing multi-head self-attention: Specialized heads do the heavy lifting, the rest can be pruned. arXiv preprint arXiv:1905.09418 (2019)."},{"key":"e_1_3_2_2_47_1","volume-title":"Fast transformers with clustered attention. Advances in Neural Information Processing Systems 33","author":"Vyas Apoorv","year":"2020","unstructured":"Apoorv Vyas, Angelos Katharopoulos, and Fran\u00e7ois Fleuret. 2020. Fast transformers with clustered attention. Advances in Neural Information Processing Systems 33 (2020)."},{"key":"e_1_3_2_2_48_1","volume-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461","author":"Singh Amanpreet","year":"2018","unstructured":"AlexWang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R Bowman. 2018. GLUE: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461 (2018)."},{"key":"e_1_3_2_2_49_1","volume-title":"SpAtten: Efficient Sparse Attention Architecture with Cascade Token and Head Pruning. arXiv preprint arXiv:2012.09852","author":"Wang Hanrui","year":"2020","unstructured":"Hanrui Wang, Zhekai Zhang, and Song Han. 2020. SpAtten: Efficient Sparse Attention Architecture with Cascade Token and Head Pruning. arXiv preprint arXiv:2012.09852 (2020)."},{"key":"e_1_3_2_2_50_1","volume-title":"Linformer: Self-Attention with Linear Complexity. arXiv preprint arXiv:2006.04768","author":"Li Belinda","year":"2020","unstructured":"SinongWang, Belinda Li, Madian Khabsa, Han Fang, and Hao Ma. 2020. Linformer: Self-Attention with Linear Complexity. arXiv preprint arXiv:2006.04768 (2020)."},{"key":"e_1_3_2_2_51_1","volume-title":"Structured pruning of large language models. arXiv preprint arXiv:1910.04732","author":"Wang Ziheng","year":"2019","unstructured":"Ziheng Wang, Jeremy Wohlwend, and Tao Lei. 2019. Structured pruning of large language models. arXiv preprint arXiv:1910.04732 (2019)."},{"key":"e_1_3_2_2_52_1","volume-title":"A broad-coverage challenge corpus for sentence understanding through inference. arXiv preprint arXiv:1704.05426","author":"Williams Adina","year":"2017","unstructured":"Adina Williams, Nikita Nangia, and Samuel R Bowman. 2017. A broad-coverage challenge corpus for sentence understanding through inference. arXiv preprint arXiv:1704.05426 (2017)."},{"key":"e_1_3_2_2_53_1","volume-title":"MLPruning: A Multilevel Structured Pruning Framework for Transformer-based Models. 
arXiv preprint arXiv:2105.14636","author":"Yao Zhewei","year":"2021","unstructured":"Zhewei Yao, Linjian Ma, Sheng Shen, Kurt Keutzer, and MichaelWMahoney. 2021. MLPruning: A Multilevel Structured Pruning Framework for Transformer-based Models. arXiv preprint arXiv:2105.14636 (2021)."},{"key":"e_1_3_2_2_54_1","volume-title":"TR-BERT: Dynamic Token Reduction for Accelerating BERT Inference. arXiv preprint arXiv:2105.11618","author":"Ye Deming","year":"2021","unstructured":"Deming Ye, Yankai Lin, Yufei Huang, and Maosong Sun. 2021. TR-BERT: Dynamic Token Reduction for Accelerating BERT Inference. arXiv preprint arXiv:2105.11618 (2021)."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00071"},{"key":"e_1_3_2_2_56_1","volume-title":"Q8BERT: Quantized 8bit bert. arXiv preprint arXiv:1910.06188","author":"Zafrir Ofir","year":"2019","unstructured":"Ofir Zafrir, Guy Boudoukh, Peter Izsak, and Moshe Wasserblat. 2019. Q8BERT: Quantized 8bit bert. arXiv preprint arXiv:1910.06188 (2019)."},{"key":"e_1_3_2_2_57_1","unstructured":"Manzil Zaheer Guru Guruganesh Avinava Dubey Joshua Ainslie Chris Alberti Santiago Ontanon Philip Pham Anirudh Ravula QifanWang Li Yang et al. 2020. Big bird: Transformers for longer sequences. arXiv preprint arXiv:2007.14062 (2020)."},{"key":"e_1_3_2_2_58_1","volume-title":"Ternarybert: Distillation-aware ultra-low bit bert. arXiv preprint arXiv:2009.12812","author":"Zhang Wei","year":"2020","unstructured":"Wei Zhang, Lu Hou, Yichun Yin, Lifeng Shang, Xiao Chen, Xin Jiang, and Qun Liu. 2020. Ternarybert: Distillation-aware ultra-low bit bert. arXiv preprint arXiv:2009.12812 (2020)."},{"key":"e_1_3_2_2_59_1","volume-title":"Masking as an efficient alternative to finetuning for pretrained language models. arXiv preprint arXiv:2004.12406","author":"Zhao Mengjie","year":"2020","unstructured":"Mengjie Zhao, Tao Lin, Fei Mi, Martin Jaggi, and Hinrich Sch\u00fctze. 2020. Masking as an efficient alternative to finetuning for pretrained language models. arXiv preprint arXiv:2004.12406 (2020)."}],"event":{"name":"KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Washington DC USA","acronym":"KDD '22","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539260","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3534678.3539260","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:59:59Z","timestamp":1750186799000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539260"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,14]]},"references-count":59,"alternative-id":["10.1145\/3534678.3539260","10.1145\/3534678"],"URL":"https:\/\/doi.org\/10.1145\/3534678.3539260","relation":{},"subject":[],"published":{"date-parts":[[2022,8,14]]},"assertion":[{"value":"2022-08-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}