{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:08:47Z","timestamp":1775815727112,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","funder":[{"name":"Research Impact Fund","award":["No.R1015-23"],"award-info":[{"award-number":["No.R1015-23"]}]},{"name":"Collaborative Research Fund","award":["No.C1043-24GF"],"award-info":[{"award-number":["No.C1043-24GF"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100018735","name":"Ant Group","doi-asserted-by":"publisher","award":["CCF-Ant Research Fund"],"award-info":[{"award-number":["CCF-Ant Research Fund"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100018735","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737257","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:04:26Z","timestamp":1754255066000},"page":"4975-4985","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Put Teacher in Student's Shoes: Cross-Distillation for Ultra-compact Model Compression Framework"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0073-0172","authenticated-orcid":false,"given":"Maolin","family":"Wang","sequence":"first","affiliation":[{"name":"City University of Hong Kong, Hong Kong SAR, China and Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9643-8059","authenticated-orcid":false,"given":"Jun","family":"Chu","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8526-7163","authenticated-orcid":false,"given":"Sicong","family":"Xie","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5351-1752","authenticated-orcid":false,"given":"Xiaoling","family":"Zang","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0807-929X","authenticated-orcid":false,"given":"Yao","family":"Zhao","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8861-9503","authenticated-orcid":false,"given":"Wenliang","family":"Zhong","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2926-4416","authenticated-orcid":false,"given":"Xiangyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong SAR, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Aiman Erbad, Amr Mohamed, Mounir Hamdi, and Mohsen Guizani.","author":"Baccour Emna","year":"2022","unstructured":"Emna Baccour, Naram Mhaisen, Alaa Awad Abdellatif, Aiman Erbad, Amr Mohamed, Mounir Hamdi, and Mohsen Guizani. 2022. Pervasive AI for IoT applications: A survey on resource-efficient distributed artificial intelligence. IEEE Communications Surveys & Tutorials(2022)."},{"key":"e_1_3_2_2_2_1","unstructured":"Bo Chen Xiangyu Zhao Yejing Wang Wenqi Fan Huifeng Guo and Ruiming Tang. 2022b. Automated machine learning for deep recommender systems: A survey. arXiv preprint arXiv:2204.01390(2022)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01163"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i8.16865"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-020-09816-7"},{"key":"e_1_3_2_2_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. 
arXiv preprint arXiv:1810.04805(2018).","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805(2018)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591701"},{"key":"e_1_3_2_2_8_1","first-page":"2477","article-title":"EdgeRec: recommender system on edge in Mobile Taobao","author":"Gong Yu","year":"2020","unstructured":"Yu Gong, Ziwen Jiang, Yufei Feng, Binbin Hu, Kaiqi Zhao, Qingwen Liu, and Wenwu Ou. 2020. EdgeRec: recommender system on edge in Mobile Taobao. In CIKM. 2477-2484.","journal-title":"CIKM."},{"key":"e_1_3_2_2_9_1","first-page":"3690","article-title":"PoWER-BERT: Accelerating BERT inference via progressive word-vector elimination","author":"Goyal Saurabh","year":"2020","unstructured":"Saurabh Goyal, Anamitra Roy Choudhury, Saurabh Raje, Venkatesan Chakaravarthy, Yogish Sabharwal, and Ashish Verma. 2020. PoWER-BERT: Accelerating BERT inference via progressive word-vector elimination. In ICML. PMLR, 3690-3699.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3332290"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3487045"},{"key":"e_1_3_2_2_12_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531(2015)."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1356"},{"key":"e_1_3_2_2_14_1","first-page":"4163","article-title":"TinyBERT","author":"Jiao Xiaoqi","year":"2020","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2020. TinyBERT: Distilling BERT for Natural Language Understanding. In Findings of EMNLP. 
4163-4174.","journal-title":"Distilling BERT for Natural Language Understanding. In Findings of EMNLP."},{"key":"e_1_3_2_2_15_1","first-page":"1","article-title":"Distill-quantize-tune-Leveraging large teachers for low-footprint efficient multilingual NLU on edge","author":"Kharazmi Pegah","year":"2023","unstructured":"Pegah Kharazmi, Zhewei Zhao, Clement Chung, and Samridhi Choudhary. 2023. Distill-quantize-tune-Leveraging large teachers for low-footprint efficient multilingual NLU on edge. In ICASSP. IEEE, 1-5.","journal-title":"ICASSP. IEEE"},{"key":"e_1_3_2_2_16_1","volume-title":"Natural language processing: State of the art, current trends and challenges. Multimedia tools and applications","author":"Khurana Diksha","year":"2023","unstructured":"Diksha Khurana, Aditya Koli, Kiran Khatter, and Sukhdev Singh. 2023. Natural language processing: State of the art, current trends and challenges. Multimedia tools and applications, Vol. 82, 3 (2023), 3713-3744."},{"key":"e_1_3_2_2_17_1","first-page":"6501","article-title":"Length-adaptive transformer: Train once with length drop, use anytime with search","author":"Kim Gyuwan","year":"2021","unstructured":"Gyuwan Kim and Kyunghyun Cho. 2021. Length-adaptive transformer: Train once with length drop, use anytime with search. In ACL-IJCNLP. ACL, 6501-6511.","journal-title":"ACL-IJCNLP. ACL"},{"key":"e_1_3_2_2_18_1","first-page":"784","article-title":"Learned token pruning for transformers","author":"Kim Sehoon","year":"2022","unstructured":"Sehoon Kim, Sheng Shen, David Thorsley, Amir Gholami, Woosuk Kwon, Joseph Hassoun, and Kurt Keutzer. 2022. Learned token pruning for transformers. In KDD. 784-794.","journal-title":"KDD."},{"key":"e_1_3_2_2_19_1","unstructured":"Taehyeon Kim Jaehoon Oh NakYil Kim Sangwook Cho and Se-Young Yun. 2021. Comparing kullback-leibler divergence and mean squared error loss in knowledge distillation. 
arXiv preprint arXiv:2105.08919(2021)."},{"key":"e_1_3_2_2_20_1","volume-title":"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. In ICLR.","author":"Lan Zhenzhong","year":"2019","unstructured":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut. 2019. ALBERT: A Lite BERT for Self-supervised Learning of Language Representations. In ICLR."},{"key":"e_1_3_2_2_21_1","first-page":"20852","article-title":"Less is more: Task-aware layer-wise distillation for language model compression","author":"Liang Chen","year":"2023","unstructured":"Chen Liang, Simiao Zuo, Qingru Zhang, Pengcheng He, Weizhu Chen, and Tuo Zhao. 2023. Less is more: Task-aware layer-wise distillation for language model compression. In ICML. PMLR, 20852-20867.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583339"},{"key":"e_1_3_2_2_23_1","unstructured":"Weixin Liu Xuyi Chen Jiaxiang Liu Shikun Feng Yu Sun Hao Tian and Hua Wu. 2023. Ernie 3.0 tiny: Frustratingly simple method to improve task-agnostic distillation generalization. arXiv preprint arXiv:2301.03416(2023)."},{"key":"e_1_3_2_2_24_1","first-page":"28092","article-title":"Post-training quantization for vision transformer","volume":"34","author":"Liu Zhenhua","year":"2021","unstructured":"Zhenhua Liu, Yunhe Wang, Kai Han, Wei Zhang, Siwei Ma, and Wen Gao. 2021. Post-training quantization for vision transformer. Neurips, Vol. 
34 (2021), 28092-28103.","journal-title":"Neurips"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578938"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605943"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLA.2019.00127"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-021-06053-z"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Fabio Petroni Tim Rockt\u00e4schel Patrick Lewis Anton Bakhtin Yuxiang Wu Alexander H Miller and Sebastian Riedel. 2019. Language models as knowledge bases? arXiv preprint arXiv:1909.01066(2019).","DOI":"10.18653\/v1\/D19-1250"},{"key":"e_1_3_2_2_31_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_2_32_1","unstructured":"Victor Sanh Lysandre Debut Julien Chaumond and Thomas Wolf. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. arXiv preprint arXiv:1910.01108(2019)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/COMST.2020.3007787"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM54844.2022.00056"},{"key":"e_1_3_2_2_35_1","volume-title":"Neurips","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Neurips, Vol. 30 (2017)."},{"key":"e_1_3_2_2_36_1","unstructured":"Chenguang Wang Mu Li and Alexander J Smola. 2019. Language models with transformers. 
arXiv preprint arXiv:1904.09408(2019)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"e_1_3_2_2_38_1","unstructured":"Maolin Wang Yu Pan Xiangli Yang Guangxi Li and Zenglin Xu. 2023b. Tensor networks meet neural networks: A survey. arXiv preprint arXiv:2302.09019(2023)."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Maolin Wang Yao Zhao Jiajia Liu Jingdong Chen Chenyi Zhuang Jinjie Gu Ruocheng Guo and Xiangyu Zhao. 2023c. Large multimodal model compression via efficient pruning and distillation at AntGroup. arXiv preprint arXiv:2312.05795(2023).","DOI":"10.1145\/3589335.3648321"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589335.3648321"},{"key":"e_1_3_2_2_41_1","first-page":"14140","article-title":"Deep Compression of Pre-trained Transformer Models","volume":"35","author":"Wang Naigang","year":"2022","unstructured":"Naigang Wang, Chi-Chun Charlie Liu, Swagath Venkataramani, Sanchari Sen, Chia-Yu Chen, Kaoutar El Maghraoui, Vijayalakshmi Viji Srinivasan, and Leland Chang. 2022a. Deep Compression of Pre-trained Transformer Models. Neurips, Vol. 35 (2022), 14140-14154.","journal-title":"Neurips"},{"key":"e_1_3_2_2_42_1","first-page":"5776","article-title":"Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","volume":"33","author":"Wang Wenhui","year":"2020","unstructured":"Wenhui Wang, Furu Wei, Li Dong, Hangbo Bao, Nan Yang, and Ming Zhou. 2020. Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. Neurips, Vol. 33 (2020), 5776-5788.","journal-title":"Neurips"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Xiaojie Wang Jiameng Li Zhaolong Ning Qingyang Song Lei Guo Song Guo and Mohammad S Obaidat. 2023a. Wireless powered mobile edge computing networks: A survey. Comput. 
Surveys(2023).","DOI":"10.1145\/3579992"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3512071"},{"key":"e_1_3_2_2_45_1","volume-title":"Noisytune: A little noise can help you finetune pretrained language models better. arXiv preprint arXiv:2202.12024(2022).","author":"Wu Chuhan","year":"2022","unstructured":"Chuhan Wu, Fangzhao Wu, Tao Qi, Yongfeng Huang, and Xing Xie. 2022. Noisytune: A little noise can help you finetune pretrained language models better. arXiv preprint arXiv:2202.12024(2022)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i9.26255"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.419"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"crossref","unstructured":"Yi Yang Chen Zhang and Dawei Song. 2022. Sparse teachers can be dense with knowledge. arXiv preprint arXiv:2210.03923(2022).","DOI":"10.18653\/v1\/2022.emnlp-main.258"},{"key":"e_1_3_2_2_49_1","unstructured":"Xin Yao Ziqing Yang Yiming Cui and Shijin Wang. 2023. MiniRBT: A Two-stage Distilled Small Chinese Pre-trained Model. arXiv preprint arXiv:2304.00717(2023)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"crossref","unstructured":"Chen Zhang Yang Yang Jiahao Liu Jingang Wang Yunsen Xian Benyou Wang and Dawei Song. 2023. Lifting the curse of capacity gap in distilling language models. arXiv preprint arXiv:2305.12129(2023).","DOI":"10.18653\/v1\/2023.acl-long.249"},{"key":"e_1_3_2_2_51_1","volume-title":"Minidisc: Minimal distillation schedule for language model compression. arXiv preprint arXiv:2205.14570(2022).","author":"Zhang Chen","year":"2022","unstructured":"Chen Zhang, Yang Yang, Qifan Wang, Jiahao Liu, Jingang Wang, Wei Wu, and Dawei Song. 2022. Minidisc: Minimal distillation schedule for language model compression. 
arXiv preprint arXiv:2205.14570(2022)."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3450124"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM51629.2021.00101"},{"key":"e_1_3_2_2_54_1","unstructured":"Wangchunshu Zhou Canwen Xu and Julian McAuley. 2021. BERT learns to teach: Knowledge distillation with meta learning. arXiv preprint arXiv:2106.04570(2021)."},{"key":"e_1_3_2_2_55_1","unstructured":"You Zhou Xiujing Lin Xiang Zhang Maolin Wang Gangwei Jiang Huakang Lu Yupeng Wu Kai Zhang Zhe Yang Kehang Wang et al. 2023. On the opportunities of green computing: A survey. arXiv preprint arXiv:2311.00447(2023)."}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737257","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T14:45:31Z","timestamp":1755355531000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737257"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":55,"alternative-id":["10.1145\/3711896.3737257","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737257","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}