{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T09:17:30Z","timestamp":1758273450066,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030863647"},{"type":"electronic","value":"9783030863654"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-86365-4_46","type":"book-chapter","created":{"date-parts":[[2021,9,10]],"date-time":"2021-09-10T11:02:39Z","timestamp":1631271759000},"page":"570-581","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Extract then Distill: Efficient and\u00a0Effective Task-Agnostic BERT Distillation"],"prefix":"10.1007","author":[{"given":"Cheng","family":"Chen","sequence":"first","affiliation":[]},{"given":"Yichun","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Lifeng","family":"Shang","sequence":"additional","affiliation":[]},{"given":"Zhi","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xin","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Xiao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Qun","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,7]]},"reference":[{"key":"46_CR1","unstructured":"Clark, K., Luong, M.T., Le, Q.V., Manning, C.D.: ELECTRA: pre-training text encoders as discriminators rather than generators. In: ICLR 2019"},{"key":"46_CR2","unstructured":"Devlin, J., Chang, M.W., Lee, K., et al.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL 2019"},{"key":"46_CR3","unstructured":"Fan, A., Grave, E., Joulin, A.: Reducing transformer depth on demand with structured dropout. In: ICLR (2019)"},{"key":"46_CR4","unstructured":"Frankle, J., Carbin, M.: The lottery ticket hypothesis: finding sparse, trainable neural networks. In: ICLR (2018)"},{"key":"46_CR5","doi-asserted-by":"crossref","unstructured":"Gordon, M.A., Duh, K., Andrews, N.: Compressing BERT: studying the effects of weight pruning on transfer learning. In: ACL (2020)","DOI":"10.18653\/v1\/2020.repl4nlp-1.18"},{"key":"46_CR6","unstructured":"Han, S., Pool, J., Tran, J., Dally, W.J.: Learning both weights and connections for efficient neural networks. In: NIPS (2015)"},{"key":"46_CR7","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (GELUs). arXiv preprint arXiv:1606.08415 (2016)"},{"key":"46_CR8","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"46_CR9","unstructured":"Hou, L., Huang, Z., Shang, L., Jiang, X., Chen, X., Liu, Q.: DynaBERT: dynamic BERT with adaptive width and depth. 
In: NIPS (2020)"},{"key":"46_CR10","doi-asserted-by":"crossref","unstructured":"Jiao, X., Chang, H., Yin, Y., et al.: Improving task-agnostic BERT distillation with layer mapping search. arXiv preprint arXiv:2012.06153 (2020)","DOI":"10.1016\/j.neucom.2021.07.050"},{"key":"46_CR11","doi-asserted-by":"crossref","unstructured":"Jiao, X., Yin, Y., Shang, L., et al.: TinyBERT: distilling BERT for natural language understanding. In: EMNLP 2020: Findings (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"46_CR12","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., et al.: ALBERT: a lite BERT for self-supervised learning of language representations. In: ICLR (2019)"},{"key":"46_CR13","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., et al.: RoBERTa: a robustly optimized BERT pretraining approach. CoRR (2019)"},{"key":"46_CR14","unstructured":"McCarley, J., Chakravarti, R., Sil, A.: Structured pruning of a BERT-based question answering model. arXiv preprint arXiv:1910.06360 (2019)"},{"key":"46_CR15","unstructured":"Michel, P., Levy, O., Neubig, G.: Are sixteen heads really better than one? In: NIPS (2019)"},{"key":"46_CR16","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. In: JMLR (2020)"},{"key":"46_CR17","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., Liang, P.: Squad: 100,000+ questions for machine comprehension of text. In: EMNLP (2016)","DOI":"10.18653\/v1\/D16-1264"},{"key":"46_CR18","unstructured":"Sajjad, H., Dalvi, F., Durrani, N., Nakov, P.: Poor man\u2019s BERT: smaller and faster transformer models. arXiv preprint arXiv:2004.03844 (2020)"},{"key":"46_CR19","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108 (2019)"},{"key":"46_CR20","doi-asserted-by":"crossref","unstructured":"Sun, S., Cheng, Y., Gan, Z., Liu, J.: Patient knowledge distillation for BERT model compression. In: EMNLP-IJCNLP (2019)","DOI":"10.18653\/v1\/D19-1441"},{"key":"46_CR21","doi-asserted-by":"crossref","unstructured":"Sun, Z., Yu, H., Song, X., Liu, R., Yang, Y., Zhou, D.: MobileBERT: a compact task-agnostic BERT for resource-limited devices. In: ACL (2020)","DOI":"10.18653\/v1\/2020.acl-main.195"},{"key":"46_CR22","unstructured":"Tang, R., Lu, Y., Liu, L., Mou, L., et al.: Distilling task-specific knowledge from BERT into simple neural networks. arXiv preprint arXiv:1903.12136 (2019)"},{"key":"46_CR23","doi-asserted-by":"crossref","unstructured":"Voita, E., Talbot, D., Moiseev, F., et al.: Analyzing multi-head self-attention: specialized heads do the heavy lifting, the rest can be pruned. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1580"},{"key":"46_CR24","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., et al.: GLUE: a multi-task benchmark and analysis platform for natural language understanding. In: EMNLP (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"46_CR25","doi-asserted-by":"crossref","unstructured":"Wang, W., Wei, F., Dong, L., Bao, H., et al.: MiniLM: deep self-attention distillation for task-agnostic compression of pre-trained transformers. In: NIPS (2020)","DOI":"10.18653\/v1\/2021.findings-acl.188"},{"key":"46_CR26","unstructured":"de Wynter, A., Perry, D.J.: Optimal subarchitecture extraction for BERT. 
arXiv preprint arXiv:2010.10499 (2020)"},{"key":"46_CR27","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.R., Le, Q.V.: XLNet: generalized autoregressive pretraining for language understanding. In: NIPS (2019)"},{"issue":"1","key":"46_CR28","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1111\/j.1467-9868.2005.00532.x","volume":"68","author":"M Yuan","year":"2006","unstructured":"Yuan, M., Lin, Y.: Model selection and estimation in regression with grouped variables. J. R. Stat. Soc. Ser. B 68(1), 49\u201367 (2006)","journal-title":"J. R. Stat. Soc. Ser. B"},{"key":"46_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Qi, F., Liu, Z., Liu, Q., Sun, M.: Know what you don\u2019t need: single-shot meta-pruning for attention heads. arXiv preprint arXiv:2011.03770 (2020)","DOI":"10.1016\/j.aiopen.2021.05.003"},{"key":"46_CR30","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., Zemel, R., et al.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning \u2013 ICANN 2021"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-86365-4_46","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,9,10]],"date-time":"2021-09-10T11:14:25Z","timestamp":1631272465000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-86365-4_46"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030863647","9783030863654"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-86365-4_46","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"7 September 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bratislava","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Slovakia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 September 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"icann2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/e-nns.org\/icann2021\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OCS","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"496","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"265","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"53% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Conference was held online due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}