{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T12:10:04Z","timestamp":1743855004100,"version":"3.40.3"},"publisher-location":"Cham","reference-count":49,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031887130","type":"print"},{"value":"9783031887147","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88714-7_20","type":"book-chapter","created":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T11:33:43Z","timestamp":1743852823000},"page":"219-228","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Hierarchical Skip Decoding for\u00a0Efficient Autoregressive Language Model"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8147-3138","authenticated-orcid":false,"given":"Yunqi","family":"Zhu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8343-125X","authenticated-orcid":false,"given":"Xuebing","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3409-2970","authenticated-orcid":false,"given":"Yuanyuan","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0752-941X","authenticated-orcid":false,"given":"Wensheng","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,4,4]]},"reference":[{"key":"20_CR1","unstructured":"Abdin, M., et al.: Phi-2: The surprising power of small language models (2023)"},{"key":"20_CR2","unstructured":"Beltagy, I., Peters, M.E., Cohan, A.: LongFormer: The long-document transformer. arXiv: 2004.05150 (2020)"},{"key":"20_CR3","unstructured":"Bolukbasi, T., Wang, J., Dekel, O., Saligrama, V.: Adaptive neural networks for efficient inference. In: Proceedings of the 34th International Conference on Machine Learning, pp. 527-536 (2017)"},{"key":"20_CR4","unstructured":"Corro, L.D., Giorno, A.D., Agarwal, S., Yu, B., Awadallah, A., Mukherjee, S.: Skipdecode: autoregressive skip decoding with batching and caching for efficient LLM inference. arXiv: 2307.02628 (2023)"},{"key":"20_CR5","unstructured":"Dao, T.: Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv: 2307.08691 (2023)"},{"key":"20_CR6","unstructured":"Dao, T., Fu, D.Y., Ermon, S., Rudra, A., R\u00e9, C.: FlashAttention: fast and memory-efficient exact attention with IO-awareness. In: Proceedings of NeurIPS (2022)"},{"key":"20_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL, pp. 4171\u20134186, June 2019"},{"key":"20_CR8","unstructured":"Elbayad, M., Gu, J., Grave, E., Auli, M.: Depth-adaptive transformer. In: Proceedings of ICLR (2020)"},{"key":"20_CR9","unstructured":"Fan, A., Grave, E., Joulin, A.: Reducing transformer depth on demand with structured dropout. In: Proceedings of ICLR (2020)"},{"key":"20_CR10","unstructured":"Gim, I., Chen, G., Seob Lee, S., Sarda, N., Khandelwal, A., Zhong, L.: Prompt cache: Modular attention reuse for low-latency inference. arXiv: 2311.04934 (2023)"},{"key":"20_CR11","unstructured":"Han, S., Mao, H., Dally, W.J.: Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding. In: Proceedings of ICLR (2016)"},{"key":"20_CR12","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network, March 2015"},{"key":"20_CR13","unstructured":"Hou, L., Huang, Z., Shang, L., Jiang, X., Chen, X., Liu, Q.: DynaBERT: Dynamic BERT with adaptive width and depth. In: Proceedings of NeurIPS, vol.\u00a033, pp. 9782\u20139793 (2020)"},{"key":"20_CR14","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. In: Proceedings of ICLR (2022)"},{"key":"20_CR15","unstructured":"Kaya, Y., Hong, S., Dumitras, T.: Shallow-deep networks: understanding and mitigating network overthinking. In: Proceedings of ICML, vol.\u00a097, pp. 3301\u20133310, 09\u201315 June 2019"},{"key":"20_CR16","unstructured":"Kitaev, N., Kaiser, L., Levskaya, A.: Reformer: the efficient transformer. In: Proceedings of ICLR (2020)"},{"key":"20_CR17","unstructured":"Lan, T., Cai, D., Wang, Y., Huang, H., Mao, X.L.: Copy is all you need. In: Proceedings of ICLR (2023)"},{"key":"20_CR18","unstructured":"Li, H., Kadav, A., Durdanovic, I., Samet, H., Graf, H.P.: Pruning filters for efficient convnets. In: Proceedings of ICLR (2017)"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Li, L., et al.: CascadeBERT: accelerating inference of pre-trained language models via calibrated complete models cascade. In: Findings of EMNLP, pp. 475\u2013486, November 2021","DOI":"10.18653\/v1\/2021.findings-emnlp.43"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Lin, B.Y., et al.: CommonGen: a constrained text generation challenge for generative commonsense reasoning. In: Findings of EMNLP, pp. 1823\u20131840 (Nov 2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.165"},{"key":"20_CR21","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Proceedings of ACL, pp. 74\u201381, July 2004"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Liu, W., Zhou, P., Wang, Z., Zhao, Z., Deng, H., Ju, Q.: FastBERT: a self-distilling BERT with adaptive inference time. In: Proceedings of ACL. pp. 6035\u20136044, July 2020","DOI":"10.18653\/v1\/2020.acl-main.537"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Y., Meng, F., Zhou, J., Chen, Y., Xu, J.: Faster depth-adaptive transformers. In: Proceedings of AAAI (2020)","DOI":"10.1609\/aaai.v35i15.17584"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Nallapati, R., Zhou, B., dos Santos, C., Gulcehre, C., Xiang, B.: Abstractive text summarization using sequence-to-sequence RNNs and beyond. In: Proceedings of SIGNLL, pp. 280\u2013290, August 2016","DOI":"10.18653\/v1\/K16-1028"},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"Narayan, S., Cohen, S.B., Lapata, M.: Don\u2019t give me the details, just the summary! topic-aware convolutional neural networks for extreme summarization. In: Proceedings of EMNLP, pp. 1797\u20131807, October\u2013November 2018","DOI":"10.18653\/v1\/D18-1206"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Novikova, J., Du\u0161ek, O., Rieser, V.: The E2E dataset: new challenges for end-to-end generation. In: Proceedings of the 18th Annual SIGdial Meeting on Discourse and Dialogue, pp. 201\u2013206, August 2017","DOI":"10.18653\/v1\/W17-5525"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Pan, H., Wang, C., Qiu, M., Zhang, Y., Li, Y., Huang, J.: Meta-KD: A meta knowledge distillation framework for language model compression across domains. In: Proceedings of ACL-IJCNLP, pp. 3026\u20133036, August 2021","DOI":"10.18653\/v1\/2021.acl-long.236"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of ACL, pp. 311\u2013318, July 2002","DOI":"10.3115\/1073083.1073135"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Peng, B., et al.: RWKV: Reinventing RNNs for the transformer era. arXiv: 2305.13048 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.936"},{"key":"20_CR30","unstructured":"Radford, A., Narasimhan, K.: Improving language understanding by generative pre-training (2018)"},{"key":"20_CR31","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners (2019)"},{"issue":"140","key":"20_CR32","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"20_CR33","unstructured":"Ranzato, M., Chopra, S., Auli, M., Zaremba, W.: Sequence level training with recurrent neural networks. In: Proceedings of ICLR (2016)"},{"key":"20_CR34","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2022.101429","volume":"77","author":"H Sajjad","year":"2023","unstructured":"Sajjad, H., Dalvi, F., Durrani, N., Nakov, P.: On the effect of dropping layers of pre-trained transformer models. Comput. Speech Lang. 77, 101429 (2023)","journal-title":"Comput. Speech Lang."},{"key":"20_CR35","unstructured":"Schuster, T., et al.: Confident adaptive language modeling. In: Proceedings of NeurIPS, vol.\u00a035, pp. 17456\u201317472 (2022)"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Schwartz, R., Stanovsky, G., Swayamdipta, S., Dodge, J., Smith, N.A.: The right tool for the job: Matching model and instance complexities. In: Proceedings of ACL, pp. 6640\u20136651 (Jul 2020)","DOI":"10.18653\/v1\/2020.acl-main.593"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Srinivas, S., Babu, R.V.: Data-free parameter pruning for deep neural networks. In: British Machine Vision Conference (2015)","DOI":"10.5244\/C.29.31"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Sun, T., et al.: A simple hash-based early exiting approach for language understanding and generation. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 2409\u20132421, May 2022","DOI":"10.18653\/v1\/2022.findings-acl.189"},{"key":"20_CR39","unstructured":"Sun, Y., et al.: Retentive network: a successor to transformer for large language models. arXiv: 2307.08621 (2023)"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Teerapittayanon, S., McDanel, B., Kung, H.: BranchyNet: Fast inference via early exiting from deep neural networks. In: 2016 23rd International Conference on Pattern Recognition (ICPR), pp. 2464\u20132469 (2016)","DOI":"10.1109\/ICPR.2016.7900006"},{"key":"20_CR41","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: self-attention with linear complexity. arXiv: 2006.04768 (2020)"},{"key":"20_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"420","DOI":"10.1007\/978-3-030-01261-8_25","volume-title":"Computer Vision \u2013 ECCV 2018","author":"X Wang","year":"2018","unstructured":"Wang, X., Yu, F., Dou, Z.-Y., Darrell, T., Gonzalez, J.E.: SkipNet: learning dynamic routing in convolutional networks. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11217, pp. 420\u2013436. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01261-8_25"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Xia, M., Zhong, Z., Chen, D.: Structured pruning learns compact and accurate models. In: Proceedings of ACL, pp. 1513\u20131528, May 2022","DOI":"10.18653\/v1\/2022.acl-long.107"},{"key":"20_CR44","unstructured":"Xiao, G., Tian, Y., Chen, B., Han, S., Lewis, M.: Efficient streaming language models with attention sinks. arXiv: 2309.17453 (2023)"},{"key":"20_CR45","doi-asserted-by":"crossref","unstructured":"Xin, J., Tang, R., Lee, J., Yu, Y., Lin, J.: DeeBERT: dynamic early exiting for accelerating BERT inference. In: Proceedings of ACL, pp. 2246\u20132251, July 2020","DOI":"10.18653\/v1\/2020.acl-main.204"},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Xin, J., Tang, R., Yu, Y., Lin, J.: BERxiT: early exiting for BERT with better fine-tuning and extension to regression. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pp. 91\u2013104, April 2021","DOI":"10.18653\/v1\/2021.eacl-main.8"},{"key":"20_CR47","doi-asserted-by":"crossref","unstructured":"Zeng, G., et al.: MedDialog: large-scale medical dialogue datasets. In: Proceedings of EMNLP, pp. 9241\u20139250, November 2020","DOI":"10.18653\/v1\/2020.emnlp-main.743"},{"key":"20_CR48","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: BERTscore: evaluating text generation with BERT. In: Proceedings of ICLR (2020)"},{"key":"20_CR49","unstructured":"Zhou, W., Xu, C., Ge, T., McAuley, J., Xu, K., Wei, F.: Bert loses patience: fast and robust inference with early exit. In: Proceedings of NeurIPS, vol.\u00a033, pp. 18330\u201318341 (2020)"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88714-7_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T11:33:53Z","timestamp":1743852833000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88714-7_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031887130","9783031887147"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88714-7_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"4 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lucca","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 April 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"47","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2025.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}