{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T07:40:07Z","timestamp":1768549207235,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,26]],"date-time":"2021-10-26T00:00:00Z","timestamp":1635206400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Central Government Guiding Local Science And Technology Development Foundation Projects (Science and Technology Innovation Base Projects)","award":["206Z0302G"],"award-info":[{"award-number":["206Z0302G"]}]},{"name":"National Key Research And Development Program of China","award":["2019YFB1405802"],"award-info":[{"award-number":["2019YFB1405802"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,26]]},"DOI":"10.1145\/3459637.3481909","type":"proceedings-article","created":{"date-parts":[[2021,11,15]],"date-time":"2021-11-15T15:53:43Z","timestamp":1636991623000},"page":"3965-3975","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Distilling Knowledge from BERT into Simple Fully Connected Neural Networks for Efficient Vertical Retrieval"],"prefix":"10.1145","author":[{"given":"Peiyang","family":"Liu","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"given":"Xi","family":"Wang","sequence":"additional","affiliation":[{"name":"PX Securities, Beijing, China"}]},{"given":"Lin","family":"Wang","sequence":"additional","affiliation":[{"name":"PX Securities, Beijing, China"}]},{"given":"Wei","family":"Ye","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"given":"Xiangyu","family":"Xi","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"given":"Shikun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/3026877.3026899"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Amin Ahmad Noah Constant Yinfei Yang and Daniel Cer. 2019. ReQA: An Evaluation for End-to-End Answer Retrieval Models. In MRQA@EMNLP. 137--146. https:\/\/doi.org\/10.18653\/v1\/D19-5819  Amin Ahmad Noah Constant Yinfei Yang and Daniel Cer. 2019. ReQA: An Evaluation for End-to-End Answer Retrieval Models. In MRQA@EMNLP. 137--146. https:\/\/doi.org\/10.18653\/v1\/D19-5819","DOI":"10.18653\/v1\/D19-5819"},{"key":"e_1_3_2_1_3_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/341"},{"key":"e_1_3_2_1_5_1","volume-title":"DiPair: Fast and Accurate Distillation for Trillion-Scale Text Matching and Pair Modeling. CoRR abs\/2010.03099","author":"Chen Jiecao","year":"2020"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/1060745.1060764"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Yiming Cui Ting Liu Wanxiang Che Li Xiao Zhipeng Chen Wentao Ma Shijin Wang and Guoping Hu. 2019. A Span-Extraction Dataset for Chinese Machine Reading Comprehension. In EMNLP-IJCNLP. Hong Kong China 5886--5891. https:\/\/doi.org\/10.18653\/v1\/D19-1600  Yiming Cui Ting Liu Wanxiang Che Li Xiao Zhipeng Chen Wentao Ma Shijin Wang and Guoping Hu. 2019. A Span-Extraction Dataset for Chinese Machine Reading Comprehension. In EMNLP-IJCNLP. Hong Kong China 5886--5891. https:\/\/doi.org\/10.18653\/v1\/D19-1600","DOI":"10.18653\/v1\/D19-1600"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Associa- tion for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019","volume":"1","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_1_9_1","volume-title":"Reducing Transformer Depth on Demand with Structured Dropout. In 8th International Conference on Learning Representations, ICLR 2020","author":"Fan Angela","year":"2020"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.repl4nlp-1.18"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1062745.1062778"},{"key":"e_1_3_2_1_12_1","volume-title":"Reweighted Proximal Pruning for Large-Scale Language Representation. CoRR abs\/1909.12486","author":"Guo Fu-Ming","year":"2019"},{"key":"e_1_3_2_1_13_1","volume-title":"Bidirectional LSTM-CRF models for sequence tagging. arXiv preprint arXiv:1508.01991","author":"Huang Zhiheng","year":"2015"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"key":"e_1_3_2_1_17_1","volume-title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. In 8th International Conference on Learning Representations, ICLR 2020","author":"Lan Zhenzhong","year":"2020"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.3233\/SW-140134"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 2021 Conference of the North American","author":"Liu Peiyang","year":"2021"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.537"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 27th International Conference on Computational Linguistics, COLING 2018","author":"Liu Xin","year":"2018"},{"key":"e_1_3_2_1_22_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002491"},{"key":"e_1_3_2_1_24_1","volume-title":"Pruning a BERT-based Question Answering Model. CoRR abs\/1910.06360","author":"McCarley J. S.","year":"2019"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455544"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2018.12.041"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01716-3_18"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Pranav Rajpurkar Jian Zhang Konstantin Lopyrev and Percy Liang. 2016. SQuAD: 100 000+ Questions for Machine Comprehension of Text. In EMNLP Jian Su Xavier Carreras and Kevin Duh (Eds.). 2383--2392. https:\/\/doi.org\/10.18653\/v1\/d16-1264  Pranav Rajpurkar Jian Zhang Konstantin Lopyrev and Percy Liang. 2016. SQuAD: 100 000+ Questions for Machine Comprehension of Text. In EMNLP Jian Su Xavier Carreras and Kevin Duh (Eds.). 2383--2392. https:\/\/doi.org\/10.18653\/v1\/d16-1264","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_30_1","volume-title":"Dis- tilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108","author":"Sanh Victor","year":"2019"},{"key":"e_1_3_2_1_31_1","volume-title":"DRCD: a Chinese Machine Reading Comprehension Dataset. CoRR abs\/1806.00920","author":"Shao Chih-Chieh","year":"2018"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6409"},{"key":"e_1_3_2_1_33_1","volume-title":"Andrew Y. Ng, and Christopher Potts.","author":"Socher Richard","year":"2013"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1441"},{"key":"e_1_3_2_1_35_1","volume-title":"FLOPs as a Direct Opti- mization Objective for Learning Sparse Neural Networks. CoRR abs\/1811.03060","author":"Tang Raphael","year":"2018"},{"key":"e_1_3_2_1_36_1","volume-title":"Distilling task-specific knowledge from bert into simple neural networks. arXiv preprint arXiv:1903.12136","author":"Tang Raphael","year":"2019"},{"key":"e_1_3_2_1_37_1","volume-title":"Well- Read Students Learn Better: The Impact of Student Initialization on Knowledge Distillation. CoRR abs\/1908.08962","author":"Turc Iulia","year":"2019"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.496"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/EMC2-NIPS53020.2019.00016"}],"event":{"name":"CIKM '21: The 30th ACM International Conference on Information and Knowledge Management","location":"Virtual Event Queensland Australia","acronym":"CIKM '21","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 30th ACM International Conference on Information &amp; Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3459637.3481909","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3459637.3481909","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T17:49:11Z","timestamp":1750268951000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3459637.3481909"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,26]]},"references-count":40,"alternative-id":["10.1145\/3459637.3481909","10.1145\/3459637"],"URL":"https:\/\/doi.org\/10.1145\/3459637.3481909","relation":{},"subject":[],"published":{"date-parts":[[2021,10,26]]},"assertion":[{"value":"2021-10-30","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}