{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T16:44:11Z","timestamp":1772901851243,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,1,20]],"date-time":"2020-01-20T00:00:00Z","timestamp":1579478400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,1,20]]},"DOI":"10.1145\/3336191.3371792","type":"proceedings-article","created":{"date-parts":[[2020,1,22]],"date-time":"2020-01-22T19:08:16Z","timestamp":1579720096000},"page":"690-698","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":62,"title":["Model Compression with Two-stage Multi-teacher Knowledge Distillation for Web Question Answering System"],"prefix":"10.1145","author":[{"given":"Ze","family":"Yang","sequence":"first","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"given":"Linjun","family":"Shou","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"given":"Ming","family":"Gong","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"given":"Wutao","family":"Lin","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]},{"given":"Daxin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Microsoft, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2020,1,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.2200\/S00561ED1V01Y201401HLT024"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the 57th Conference of the Association for Computational Linguistics, ACL 2019","volume":"5937","author":"Clark Kevin","year":"2019"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390177"},{"key":"e_1_3_2_1_4_1","volume-title":"Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems","author":"Denton Emily L.","year":"2014"},{"key":"e_1_3_2_1_5_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2623330.2623703"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1178"},{"key":"e_1_3_2_1_8_1","unstructured":"Jonathan Frankle and Michael Carbin. 2018. The lottery ticket hypothesis: Finding sparse trainable neural networks. arXiv preprint arXiv:1803.03635 (2018).  Jonathan Frankle and Michael Carbin. 2018. The lottery ticket hypothesis: Finding sparse trainable neural networks. arXiv preprint arXiv:1803.03635 (2018)."},{"key":"e_1_3_2_1_9_1","unstructured":"Babak Hassibi and David G. Stork. 1993. Second order derivatives for network pruning: Optimal Brain Surgeon. In Advances in Neural Information Processing Systems 5 S. J. Hanson J. D. Cowan and C. L. Giles (Eds.). Morgan-Kaufmann 164--171.  Babak Hassibi and David G. Stork. 1993. Second order derivatives for network pruning: Optimal Brain Surgeon. In Advances in Neural Information Processing Systems 5 S. J. Hanson J. D. Cowan and C. L. Giles (Eds.). 
Morgan-Kaufmann 164--171."},{"key":"e_1_3_2_1_10_1","volume-title":"Channel Pruning for Accelerating Very Deep Neural Networks. In IEEE International Conference on Computer Vision, ICCV 2017","author":"He Yihui","year":"2017"},{"key":"e_1_3_2_1_11_1","unstructured":"Geoffrey E Hinton Oriol Vinyals and Jeffrey Dean. 2015. Distilling the Knowledge in a Neural Network. arXiv: Machine Learning (2015).  Geoffrey E Hinton Oriol Vinyals and Jeffrey Dean. 2015. Distilling the Knowledge in a Neural Network. arXiv: Machine Learning (2015)."},{"key":"e_1_3_2_1_12_1","series-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics, ACL","volume-title":"Long Papers . 328--339. https:\/\/doi.org\/10.18653\/v1\/P18--1031","author":"Howard Jeremy","year":"2018"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Max Jaderberg Andrea Vedaldi and Andrew Zisserman. 2014. Speeding up convolutional neural networks with low rank expansions. arXiv preprint arXiv:1405.3866 (2014).  Max Jaderberg Andrea Vedaldi and Andrew Zisserman. 2014. Speeding up convolutional neural networks with low rank expansions. arXiv preprint arXiv:1405.3866 (2014).","DOI":"10.5244\/C.28.88"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1139"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1180"},{"key":"e_1_3_2_1_16_1","first-page":"27","article-title":"Optimal Brain Damage","author":"LeCun Yann","year":"1989","journal-title":"Advances in Neural Information Processing Systems 2, [NIPS Conference, Denver, Colorado, USA"},{"key":"e_1_3_2_1_17_1","unstructured":"Stefan Lee Senthil Purushwalkam Michael Cogswell David J. Crandall and Dhruv Batra. 2015. Why M Heads are Better than One: Training a Diverse Ensemble of Deep Networks. CoRR Vol. abs\/1511.06314 (2015). arxiv: 1511.06314 http:\/\/arxiv.org\/abs\/1511.06314  Stefan Lee Senthil Purushwalkam Michael Cogswell David J. Crandall and Dhruv Batra. 2015. Why M Heads are Better than One: Training a Diverse Ensemble of Deep Networks. CoRR Vol. abs\/1511.06314 (2015). arxiv: 1511.06314 http:\/\/arxiv.org\/abs\/1511.06314"},{"key":"e_1_3_2_1_18_1","unstructured":"Xiaodong Liu Pengcheng He Weizhu Chen and Jianfeng Gao. 2019 a. Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding. CoRR Vol. abs\/1904.09482 (2019).  Xiaodong Liu Pengcheng He Weizhu Chen and Jianfeng Gao. 2019 a. Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding. CoRR Vol. abs\/1904.09482 (2019)."},{"key":"e_1_3_2_1_19_1","series-title":"Proceedings of the 57th Conference of the Association for Computational Linguistics, ACL","volume-title":"Long Papers . 4487--4496.","author":"Liu Xiaodong","year":"2019"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2983323.2983888"},{"key":"e_1_3_2_1_21_1","volume-title":"5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24--26, 2017, Conference Track Proceedings .","author":"Papernot Nicolas","year":"2017"},{"key":"e_1_3_2_1_22_1","unstructured":"Anastasia Pentina and Christoph H Lampert. 2017. Multi-Task Learning with Labeled and Unlabeled Tasks. stat Vol. 1050 (2017) 1.  Anastasia Pentina and Christoph H Lampert. 2017. Multi-Task Learning with Labeled and Unlabeled Tasks. stat Vol. 
1050 (2017) 1."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"key":"e_1_3_2_1_24_1","unstructured":"Antonio Polino Razvan Pascanu and Dan Alistarh. 2018. Model compression via distillation and quantization. CoRR Vol. abs\/1802.05668 (2018).  Antonio Polino Razvan Pascanu and Dan Alistarh. 2018. Model compression via distillation and quantization. CoRR Vol. abs\/1802.05668 (2018)."},{"key":"e_1_3_2_1_25_1","unstructured":"Alec Radford. 2018. Improving Language Understanding by Generative Pre-Training.  Alec Radford. 2018. Improving Language Understanding by Generative Pre-Training."},{"key":"e_1_3_2_1_26_1","volume-title":"GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. In 7th International Conference on Learning Representations, ICLR 2019","author":"Wang Alex","year":"2019"},{"key":"e_1_3_2_1_27_1","volume-title":"Multi-teacher Knowledge Distillation for Compressed Video Action Recognition on Deep Neural Networks. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2019","author":"Wu Meng-Chieh","year":"2019"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 676--684","author":"Yim Junho","year":"2015"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098135"},{"key":"e_1_3_2_1_30_1","unstructured":"Yang You Jing Li Jonathan Hseu Xiaodan Song James Demmel and Cho-Jui Hsieh. 2019. Reducing BERT Pre-Training Time from 3 Days to 76 Minutes. CoRR Vol. abs\/1904.00962 (2019).  Yang You Jing Li Jonathan Hseu Xiaodan Song James Demmel and Cho-Jui Hsieh. 2019. Reducing BERT Pre-Training Time from 3 Days to 76 Minutes. CoRR Vol. abs\/1904.00962 (2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298809"}],"event":{"name":"WSDM '20: The Thirteenth ACM International Conference on Web Search and Data Mining","location":"Houston TX USA","acronym":"WSDM '20","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 13th International Conference on Web Search and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3336191.3371792","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3336191.3371792","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:26:10Z","timestamp":1750206370000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3336191.3371792"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,1,20]]},"references-count":31,"alternative-id":["10.1145\/3336191.3371792","10.1145\/3336191"],"URL":"https:\/\/doi.org\/10.1145\/3336191.3371792","relation":{},"subject":[],"published":{"date-parts":[[2020,1,20]]},"assertion":[{"value":"2020-01-22","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
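For reference, a minimal sketch (an illustrative addition, not part of the Crossref record above) of how a work record like this one can be fetched from the public Crossref REST API at https://api.crossref.org/works/{DOI} and a few fields read back out. The use of the third-party requests package and the particular fields printed are assumptions for illustration.

import requests

DOI = "10.1145/3336191.3371792"  # the DOI from the record above
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=10)
resp.raise_for_status()

# The API wraps the work in an envelope: {"status": "ok", ..., "message": {...}}
work = resp.json()["message"]

print(work["title"][0])  # paper title
print(", ".join(f"{a.get('given', '')} {a.get('family', '')}".strip()
                for a in work["author"]))  # author list
print(work["is-referenced-by-count"], "citations;",
      work["references-count"], "references")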