{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:24:04Z","timestamp":1750220644156,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,12,18]],"date-time":"2020-12-18T00:00:00Z","timestamp":1608249600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,12,18]]},"DOI":"10.1145\/3443279.3443302","type":"proceedings-article","created":{"date-parts":[[2021,2,1]],"date-time":"2021-02-01T22:50:44Z","timestamp":1612219844000},"page":"131-137","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Compressed-Transformer"],"prefix":"10.1145","author":[{"given":"Yuan,","family":"Chen","sequence":"first","affiliation":[{"name":"School of Data and Computer Science Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pan,","family":"Rong","sequence":"additional","affiliation":[{"name":"School of Data and Computer Science Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,2]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Aguilar G. Ling Y. Zhang Y. Yao B. Fan X. and Guo C. 2019. Knowledge distillation from internal representations. arXiv:1910.03723.  Aguilar G. Ling Y. Zhang Y. Yao B. Fan X. and Guo C. 2019. Knowledge distillation from internal representations. arXiv:1910.03723."},{"key":"e_1_3_2_1_2_1","unstructured":"Bahdanau D. Cho K. and Bengio Y. 2014. Neural machine translation by jointly learning to align and translate. arXiv:1409.0473.  Bahdanau D. Cho K. and Bengio Y. 2014. Neural machine translation by jointly learning to align and translate. arXiv:1409.0473."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Bucilu C. Caruana R. and Niculescu-Mizil A. 2006. Model compression. In KDD.  Bucilu C. Caruana R. and Niculescu-Mizil A. 2006. Model compression. In KDD.","DOI":"10.1145\/1150402.1150464"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Chen Y. C. Gan Z. Cheng Y. Liu J. & Liu J. 2019. Distilling knowledge learned in BERT for text generation. arXiv:1911.03829v2.  Chen Y. C. Gan Z. Cheng Y. Liu J. & Liu J. 2019. Distilling knowledge learned in BERT for text generation. arXiv:1911.03829v2.","DOI":"10.18653\/v1\/2020.acl-main.705"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Clark K. Khandelwal U. Levy O. and Manning C. D. 2019. What does BERT look at? An analysis of BERT's attention. CoRR abs: 1906.04341.  Clark K. Khandelwal U. Levy O. and Manning C. D. 2019. What does BERT look at? An analysis of BERT's attention. CoRR abs: 1906.04341.","DOI":"10.18653\/v1\/W19-4828"},{"key":"e_1_3_2_1_6_1","unstructured":"Denil M. Shakibi B. Dinh L. Ranzato M. and De Freitas N. Predicting parameters in deep learning. arXiv:1306.0543.  Denil M. Shakibi B. Dinh L. Ranzato M. and De Freitas N. Predicting parameters in deep learning. arXiv:1306.0543."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the 5th Workshop on Vision and Language.","author":"Desmond E.","year":"2016","unstructured":"Desmond , E. , Stella , F. , Khalil , S. , Lucia , S. 2016 . Multi30K: Multilingual English-German Image Descriptions . In Proceedings of the 5th Workshop on Vision and Language. Desmond, E., Stella, F., Khalil, S., Lucia, S. 2016. Multi30K: Multilingual English-German Image Descriptions. In Proceedings of the 5th Workshop on Vision and Language."},{"key":"e_1_3_2_1_8_1","volume-title":"Computer Science","author":"Hinton G.","year":"2015","unstructured":"Hinton , G. , Vinyals , O. , Dean , J. 2015 . Distilling the knowledge in a neural network . Computer Science , 2015, 14(7), 38--39. Hinton, G., Vinyals, O., Dean, J. 2015. Distilling the knowledge in a neural network. Computer Science, 2015, 14(7), 38--39."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Jiao X. Yin Y. Shang L. Jiang X. Chen X. and Li L. 2019. TinyBERT: distilling BERT for natural language understanding. arXiv: 1909.10351.  Jiao X. Yin Y. Shang L. Jiang X. Chen X. and Li L. 2019. TinyBERT: distilling BERT for natural language understanding. arXiv: 1909.10351.","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of EMNLP.","author":"Kalchbrenner N.","year":"2013","unstructured":"Kalchbrenner , N. , Blunsom , P. 2013 . Recurrent continuous translation models . In Proceedings of EMNLP. Kalchbrenner, N., Blunsom, P. 2013. Recurrent continuous translation models. In Proceedings of EMNLP."},{"key":"e_1_3_2_1_11_1","unstructured":"Kim Y. and Rush A. M. 2016. Sequence-level KD. arXiv:1606.07947v4.  Kim Y. and Rush A. M. 2016. Sequence-level KD. arXiv:1606.07947v4."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1137\/07070111X"},{"volume-title":"Proceedings of EMNLP.","author":"Kyunghyun C.","key":"e_1_3_2_1_13_1","unstructured":"Kyunghyun C. , Merrienboer B. , Gulcehre C. , Bahdanau D. , Bougares F. , Schwenk H. , and Bengio Y . 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation . In Proceedings of EMNLP. Kyunghyun C., Merrienboer B., Gulcehre C., Bahdanau D., Bougares F., Schwenk H., and Bengio Y. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. In Proceedings of EMNLP."},{"key":"e_1_3_2_1_14_1","unstructured":"Lan Z. Chen M. Goodman S. Gimpel K. Sharma P. and Soricut R. 2019. ALBERT: a lite BERT for self-supervised learning of language representations. arXiv:1909.11942.  Lan Z. Chen M. Goodman S. Gimpel K. Sharma P. and Soricut R. 2019. ALBERT: a lite BERT for self-supervised learning of language representations. arXiv:1909.11942."},{"key":"e_1_3_2_1_15_1","unstructured":"Micha\u0142 Z. Marcin J. D. and Bruno P. 2016. The United Nations parallel corpus v1.0. In Lrec 2016.  Micha\u0142 Z. Marcin J. D. and Bruno P. 2016. The United Nations parallel corpus v1.0. In Lrec 2016."},{"key":"e_1_3_2_1_16_1","unstructured":"Mukherjee S. and Awadallah A. H. 2019. Distilling transformers into simple neural networks with unlabeled transfer data. arXiv:1910.01769.  Mukherjee S. and Awadallah A. H. 2019. Distilling transformers into simple neural networks with unlabeled transfer data. arXiv:1910.01769."},{"key":"e_1_3_2_1_17_1","unstructured":"Paul M. Omer L. and Graham N. 2019. Are sixteen heads really better than one? arXiv:1905.10650.  Paul M. Omer L. and Graham N. 2019. Are sixteen heads really better than one? arXiv:1905.10650."},{"key":"e_1_3_2_1_18_1","unstructured":"Sanh V. Debut L. Chaumond J. and Wolf T. 2019. Distilbert a distilled version of BERT: smaller faster cheaper and lighter. arXiv:1910.01108.  Sanh V. Debut L. Chaumond J. and Wolf T. 2019. Distilbert a distilled version of BERT: smaller faster cheaper and lighter. arXiv:1910.01108."},{"key":"e_1_3_2_1_19_1","unstructured":"Sun S. Cheng Y. Gan Z. and Liu J. 2019. Patient KD for BERT model compression. In EMNLP.  Sun S. Cheng Y. Gan Z. and Liu J. 2019. Patient KD for BERT model compression. In EMNLP."},{"key":"e_1_3_2_1_20_1","unstructured":"Tang R. Lu Y. Liu L. Mou L. Vechtomova O. and Lin J. 2019. Distilling task-specific knowledge from BERT into simple neural networks. arXiv:1903.12136.  Tang R. Lu Y. Liu L. Mou L. Vechtomova O. and Lin J. 2019. Distilling task-specific knowledge from BERT into simple neural networks. arXiv:1903.12136."},{"key":"e_1_3_2_1_21_1","unstructured":"Vaswani A. Shazeer N. Parmar N. Uszkoreit J. Jones L. Aidan N. Gomez Kaiser L. Polosukhin I. 2017. Attention is all you need. arXiv:1706.03762.  Vaswani A. Shazeer N. Parmar N. Uszkoreit J. Jones L. Aidan N. Gomez Kaiser L. Polosukhin I. 2017. Attention is all you need. arXiv:1706.03762."},{"key":"e_1_3_2_1_22_1","unstructured":"Wen T. Lai S. Qian X. 2019. Preparing lessons: improve knowledge distillation with better supervision. arXiv:1911.07471.  Wen T. Lai S. Qian X. 2019. Preparing lessons: improve knowledge distillation with better supervision. arXiv:1911.07471."},{"key":"e_1_3_2_1_23_1","unstructured":"Wu Y. Schuster M. Chen Z. Le Q. V. Norouzi M. Macherey W. et al. 2016. Google's neural machine translation system: bridging the gap between human and machine translation. arXiv:1609.08144.  Wu Y. Schuster M. Chen Z. Le Q. V. Norouzi M. Macherey W. et al. 2016. Google's neural machine translation system: bridging the gap between human and machine translation. arXiv:1609.08144."},{"key":"e_1_3_2_1_24_1","unstructured":"Zagoruyko S. Komodakis N. 2016. Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv:1612.03928.  Zagoruyko S. Komodakis N. 2016. Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv:1612.03928."}],"event":{"name":"NLPIR 2020: 4th International Conference on Natural Language Processing and Information Retrieval","sponsor":["FernUniversit\u00e4t in Hagen"],"location":"Seoul Republic of Korea","acronym":"NLPIR 2020"},"container-title":["Proceedings of the 4th International Conference on Natural Language Processing and Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3443279.3443302","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3443279.3443302","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:02:12Z","timestamp":1750197732000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3443279.3443302"}},"subtitle":["Distilling Knowledge from Transformer for Neural Machine Translation"],"short-title":[],"issued":{"date-parts":[[2020,12,18]]},"references-count":24,"alternative-id":["10.1145\/3443279.3443302","10.1145\/3443279"],"URL":"https:\/\/doi.org\/10.1145\/3443279.3443302","relation":{},"subject":[],"published":{"date-parts":[[2020,12,18]]},"assertion":[{"value":"2021-02-01","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}