{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,18]],"date-time":"2026-01-18T09:26:49Z","timestamp":1768728409896,"version":"3.49.0"},"reference-count":33,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,2,1]],"date-time":"2025-02-01T00:00:00Z","timestamp":1738368000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Shandong Province Key R&D Program","award":["2023RKY02009"],"award-info":[{"award-number":["2023RKY02009"]}]},{"name":"Shandong Nature Science Foundation of China","award":["ZR2023MF070"],"award-info":[{"award-number":["ZR2023MF070"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Emerg. Top. Comput. 
Intell."],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1109\/tetci.2024.3418837","type":"journal-article","created":{"date-parts":[[2024,9,3]],"date-time":"2024-09-03T17:41:36Z","timestamp":1725385296000},"page":"848-860","source":"Crossref","is-referenced-by-count":1,"title":["ARC: A Layer Replacement Compression Method Based on Fine-Grained Self-Attention Distillation for Compressing Pre-Trained Language Models"],"prefix":"10.1109","volume":"9","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-6675-6512","authenticated-orcid":false,"given":"Daohan","family":"Yu","sequence":"first","affiliation":[{"name":"Shandong University of Science and Technology, Qingdao, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9184-2742","authenticated-orcid":false,"given":"Liqing","family":"Qiu","sequence":"additional","affiliation":[{"name":"Shandong University of Science and Technology, Qingdao, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref2","first-page":"14037","article-title":"Are sixteen heads really better than one?","volume-title":"Proc. 33rd Int. Conf. Neural Inf. Process. Syst.","author":"Michel","year":"2019"},{"key":"ref3","article-title":"Distilling the knowledge in a neural network","author":"Hinton","year":"2015"},{"key":"ref4","article-title":"DistilBERT, a distilled version of BERT: Smaller, faster, cheaper and lighter","author":"Sanh","year":"2019"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"ref6","article-title":"Knowledge distillation of large language models","volume-title":"Proc. 12th Int. Conf. Learn. 
Representations","author":"Han","year":"2024"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.441"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i9.26255"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4828"},{"issue":"1","key":"ref10","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"J. Mach. Learn. Res."},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.195"},{"key":"ref12","article-title":"Well-read students learn better: On the importance of pre-training compact models","author":"Turc","year":"2020"},{"key":"ref13","article-title":"Distilling task-specific knowledge from BERT into simple neural networks","author":"Tang","year":"2019"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"ref15","article-title":"ADAM: A method for stochastic optimization","volume-title":"CoRR","author":"Kingma"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.633"},{"key":"ref17","first-page":"5776","article-title":"MINILM: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst. 2020","author":"Wang","year":"2020"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1441"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.188"},{"key":"ref20","article-title":"Compressing deep convolutional networks using vector quantization","author":"Gong","year":"2014"},{"key":"ref21","article-title":"Deep compression: Compressing deep neural network with pruning, trained quantization and Huffman coding","volume-title":"Proc. 4th Int. Conf. Learn. 
Representations ICLR","author":"Han","year":"2016"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref23","first-page":"9782","article-title":"DynaBERT: Dynamic BERT with adaptive width and depth","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Hou","year":"2020"},{"key":"ref24","article-title":"FITNETS: Hints for thin deep nets","author":"Romero","year":"2015"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2022.3164264"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/icdm51629.2021.00069"},{"key":"ref27","first-page":"3509","article-title":"LIT: Learned intermediate representation training for model compression","volume-title":"Proc. 36th Int. Conf. Mach. Learn.","author":"Koratana","year":"2019"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"ref30","article-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer","volume-title":"Proc. 5th Int. Conf. Learn. Representations ICLR","author":"Zagoruyko","year":"2017"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-1232"},{"key":"ref32","first-page":"2148","article-title":"Predicting parameters in deep learning","volume-title":"Proc. 26th Int. Conf. Neural Inf. Process. Syst., Vol. 2","author":"Denil","year":"2013"},{"key":"ref33","article-title":"Doubly convolutional neural networks","volume-title":"Advances Neural Inf. Process. 
Syst.","volume":"29","author":"Zhai","year":"2016"}],"container-title":["IEEE Transactions on Emerging Topics in Computational Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/7433297\/10850886\/10663832.pdf?arnumber=10663832","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,24]],"date-time":"2025-01-24T05:52:00Z","timestamp":1737697920000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10663832\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2]]},"references-count":33,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tetci.2024.3418837","relation":{},"ISSN":["2471-285X"],"issn-type":[{"value":"2471-285X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2]]}}}