{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:46:51Z","timestamp":1775069211351,"version":"3.50.1"},"reference-count":280,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T00:00:00Z","timestamp":1743465600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T00:00:00Z","timestamp":1743465600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T00:00:00Z","timestamp":1743465600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Advanced Manufacturing and Engineering (AME) Programmatic Fund","award":["RIE2020\/RIE2025"],"award-info":[{"award-number":["RIE2020\/RIE2025"]}]},{"name":"Advanced Manufacturing and Engineering (AME) Programmatic Fund","award":["A1892b0026"],"award-info":[{"award-number":["A1892b0026"]}]},{"name":"Advanced Manufacturing and Engineering (AME) Programmatic Fund","award":["A19E3b0099"],"award-info":[{"award-number":["A19E3b0099"]}]},{"name":"Manufacturing, Trade and Connectivity","award":["M23L7b0021"],"award-info":[{"award-number":["M23L7b0021"]}]},{"name":"AME Young Individual Research","award":["A2084c0167"],"award-info":[{"award-number":["A2084c0167"]}]},{"name":"Career Development Fund","award":["C210812035"],"award-info":[{"award-number":["C210812035"]}]},{"DOI":"10.13039\/501100001348","name":"Agency for Science, Technology and Research","doi-asserted-by":"crossref","id":[{"id":"10.13039\/501100001348","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1109\/tnnls.2024.3394494","type":"journal-article","created":{"date-parts":[[2024,6,14]],"date-time":"2024-06-14T17:36:52Z","timestamp":1718386612000},"page":"5837-5857","source":"Crossref","is-referenced-by-count":10,"title":["From Algorithm to Hardware: A Survey on Efficient and Safe Deployment of Deep Neural Networks"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2594-9648","authenticated-orcid":false,"given":"Xue","family":"Geng","sequence":"first","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"given":"Zhe","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7753-6343","authenticated-orcid":false,"given":"Chunyun","family":"Chen","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science (CCDS), Nanyang Technological University, Jurong West, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9202-1073","authenticated-orcid":false,"given":"Qing","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"given":"Kaixin","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6858-1177","authenticated-orcid":false,"given":"Chao","family":"Jin","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7522-0898","authenticated-orcid":false,"given":"Manas","family":"Gupta","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7002-4564","authenticated-orcid":false,"given":"Xulei","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1719-0328","authenticated-orcid":false,"given":"Zhenghua","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8018-1264","authenticated-orcid":false,"given":"Mohamed M.","family":"Sabry Aly","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science (CCDS), Nanyang Technological University, Jurong West, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8971-0660","authenticated-orcid":false,"given":"Jie","family":"Lin","sequence":"additional","affiliation":[{"name":"Technology, and Research (A*STAR), Institute for Infocomm Research (I2R), Agency for Science, Fusionopolis, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0977-3600","authenticated-orcid":false,"given":"Min","family":"Wu","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0762-6562","authenticated-orcid":false,"given":"Xiaoli","family":"Li","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, Agency for Science, Technology, and Research (A*STAR), Fusionopolis, Singapore"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Scaling instruction-finetuned language models","author":"Won Chung","year":"2022","journal-title":"arXiv:2210.11416"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2017.2761740"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/SmartCloud.2016.18"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2020.2976475"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3570955"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2765695"},{"key":"ref7","article-title":"A survey on deep neural network compression: Challenges, overview, and solutions","author":"Mishra","year":"2020","journal-title":"arXiv:2010.03954"},{"key":"ref8","article-title":"An overview of neural network compression","author":"O\u2019 Neill","year":"2020","journal-title":"arXiv:2006.03669"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.3390\/informatics8040077"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/s11265-020-01596-1"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2020.101839"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.26599\/TST.2018.9010103"},{"key":"ref13","article-title":"A survey on model compression and acceleration for pretrained language models","author":"Xu","year":"2022","journal-title":"arXiv:2202.07105"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3487045"},{"key":"ref15","article-title":"A survey on methods and theories of quantized neural networks","author":"Guo","year":"2018","journal-title":"arXiv:1808.04752"},{"key":"ref16","article-title":"A survey of quantization methods for efficient neural network inference","author":"Gholami","year":"2021","journal-title":"arXiv:2103.13630"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.23919\/CCC50068.2020.9189610"},{"key":"ref18","article-title":"Pruning algorithms to accelerate convolutional neural networks for edge applications: A survey","author":"Liu","year":"2020","journal-title":"arXiv:2005.04275"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/72.248452"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.26599\/TST.2021.9010057"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3447582"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1905.01392"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01453-z"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3055564"},{"key":"ref25","first-page":"1","article-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding","volume-title":"Proc. ICLR","author":"Han"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_23"},{"key":"ref27","article-title":"Incremental network quantization: Towards lossless CNNs with low-precision weights","author":"Zhou","year":"2017","journal-title":"arXiv:1702.03044"},{"key":"ref28","article-title":"DoReFa-Net: Training low bitwidth convolutional neural networks with low bitwidth gradients","author":"Zhou","year":"2016","journal-title":"arXiv:1606.06160"},{"key":"ref29","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP49357.2023.10094626","article-title":"Ternary weight networks","volume-title":"Proc. NIPS Workshop","author":"Li"},{"key":"ref30","article-title":"Compressing deep convolutional networks using vector quantization","author":"Gong","year":"2014","journal-title":"arXiv:1412.6115"},{"key":"ref31","first-page":"2285","article-title":"Compressing neural networks with the hashing trick","volume-title":"Proc. ICML","author":"Chen"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939839"},{"key":"ref33","article-title":"And the bit goes down: Revisiting the quantization of neural networks","volume-title":"Proc. ICLR","author":"Stock"},{"key":"ref34","article-title":"Additive powers-of-two quantization: An efficient non-uniform discretization for neural networks","volume-title":"Proc. ICLR","author":"Li"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00452"},{"key":"ref36","doi-asserted-by":"crossref","DOI":"10.1109\/MM.2020.3009475","article-title":"ReLeQ: A reinforcement learning approach for deep quantization of neural networks","volume-title":"Proc. NeurIPS Workshop ML Syst.","author":"Elthakeb"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00881"},{"key":"ref38","article-title":"Mixed precision quantization of ConvNets via differentiable neural architecture search","volume-title":"Proc. ICLR","author":"Wu"},{"key":"ref39","article-title":"Mixed precision DNNs: All you need is a good parametrization","author":"Uhlich","year":"2019","journal-title":"arXiv:1905.11452"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00038"},{"key":"ref41","article-title":"Adaptive loss-aware quantization for multi-bit networks","author":"Qu","year":"2019","journal-title":"arXiv:1912.08883"},{"key":"ref42","article-title":"Post-training 4-bit quantization of convolution networks for rapid-deployment","author":"Banner","year":"2018","journal-title":"arXiv:1810.05723"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17269"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00916"},{"key":"ref45","article-title":"AutoQ: Automated kernel-wise neural network quantizations","volume-title":"Proc. ICLR","author":"Lou"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_28"},{"key":"ref47","article-title":"DeepCABAC: Context-adaptive binary arithmetic coding for deep neural network compression","volume-title":"Proc. ICML Workshop","author":"Wiedemann"},{"key":"ref48","article-title":"Scalable model compression by entropy penalized reparameterization","volume-title":"Proc. ICLR","author":"Oktay"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/DCC50243.2021.00033"},{"key":"ref50","article-title":"Adaptive quantization of neural networks","volume-title":"Proc. ICLR","author":"Khoram"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3084839"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01206"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01205"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.627"},{"key":"ref55","first-page":"30318","article-title":"Gpt3. int8(): 8-bit matrix multiplication for transformers at scale","volume-title":"Proc. NIPS","author":"Dettmers"},{"key":"ref56","first-page":"28092","article-title":"Post-training quantization for vision transformer","volume-title":"Proc. NIPS","author":"Liu"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547826"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1117\/1.1469618"},{"key":"ref59","volume-title":"Synthesis of noiseless compression codes","author":"Tunstall","year":"1967"},{"key":"ref60","article-title":"SNIP: Single-shot network pruning based on connection sensitivity","volume-title":"Proc. ICLR","author":"Lee"},{"key":"ref61","article-title":"Picking winning tickets before training by preserving gradient flow","volume-title":"Proc. ICLR","author":"Wang"},{"key":"ref62","article-title":"Progressive skeletonization: Trimming more fat from a network at initialization","volume-title":"Proc. ICLR","author":"de Jorge"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/786"},{"key":"ref64","first-page":"598","article-title":"Optimal brain damage","volume-title":"Proc. NIPS","author":"LeCun"},{"key":"ref65","first-page":"18098","article-title":"Woodfisher: Efficient second-order approximation for neural network compression","volume-title":"Proc. NIPS","author":"Singh"},{"key":"ref66","first-page":"14873","article-title":"M-FAC: Efficient matrix-free approximations of second-order information","volume-title":"Proc. NIPS","author":"Frantar"},{"key":"ref67","first-page":"12894","article-title":"Structural pruning via latency-saliency knapsack","volume-title":"Proc. NIPS","author":"Shen"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01152"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01600"},{"key":"ref70","article-title":"Dynamic model pruning with feedback","volume-title":"Proc. ICLR","author":"Lin"},{"key":"ref71","article-title":"Dynamic sparse training: Find efficient sparse network from scratch with trainable masked layers","volume-title":"Proc. ICLR","author":"Liu"},{"key":"ref72","article-title":"Pruning convolutional neural networks for resource efficient inference","volume-title":"Proc. ICLR","author":"Molchanov"},{"key":"ref73","first-page":"1","article-title":"SCOP: Scientific control for reliable neural network pruning","volume-title":"Proc. NIPS","author":"Tang"},{"key":"ref74","first-page":"5544","article-title":"Soft threshold weight reparameterization for learnable sparsity","volume-title":"Proc. ICML","author":"Kusupati"},{"key":"ref75","first-page":"11380","article-title":"Winning the lottery with continuous sparsification","volume-title":"Proc. NIPS","author":"Savarese"},{"key":"ref76","article-title":"Neural pruning via growing regularization","volume-title":"Proc. ICLR","author":"Wang"},{"key":"ref77","first-page":"2943","article-title":"Rigging the lottery: Making all tickets winners","volume-title":"Proc. ICML","author":"Evci"},{"key":"ref78","article-title":"Lookahead: A far-sighted alternative of magnitude-based pruning","volume-title":"Proc. ICLR","author":"Park"},{"key":"ref79","article-title":"To prune, or not to prune: Exploring the efficacy of pruning for model compression","volume-title":"Proc. ICLR Workshop","author":"Zhu"},{"key":"ref80","article-title":"Layer-adaptive sparsity for the magnitude-based pruning","volume-title":"Proc. ICLR","author":"Lee"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/cai59869.2024.00144"},{"key":"ref82","first-page":"4646","article-title":"Parameter efficient training of deep convolutional neural networks by dynamic sparse reparameterization","volume-title":"Proc. ICML","author":"Mostafa"},{"key":"ref83","article-title":"Sparse networks from scratch: Faster training without losing performance","author":"Dettmers","year":"2019","journal-title":"arXiv:1907.04840"},{"key":"ref84","article-title":"Learning to prune deep neural networks via reinforcement learning","volume-title":"Proc. ICML AutoML Workshop","author":"Gupta"},{"key":"ref85","first-page":"784","article-title":"AMC: AutoMl for model compression and acceleration on mobile devices","volume-title":"Proc. ECCV","author":"Lin"},{"key":"ref86","first-page":"2181","article-title":"Runtime neural pruning","volume-title":"Proc. NIPS","author":"Lin"},{"key":"ref87","article-title":"The lottery ticket hypothesis: Finding sparse, trainable neural networks","volume-title":"Proc. ICLR","author":"Frankle"},{"key":"ref88","first-page":"7948","article-title":"Post training 4-bit quantization of convolutional networks for rapid-deployment","volume-title":"Proc. NIPS","author":"Banner"},{"key":"ref89","first-page":"4475","article-title":"Optimal brain compression: A framework for accurate post-training quantization and pruning","volume-title":"Proc. NIPS","author":"Frantar"},{"key":"ref90","first-page":"24101","article-title":"A fast post-training pruning framework for transformers","volume-title":"Proc. NIPS","author":"Kwon"},{"key":"ref91","first-page":"2680","article-title":"Discovering neural wirings","volume-title":"Proc. NIPS","author":"Wortsman"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00447"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00160"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6910"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58598-3_36"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00290"},{"key":"ref97","first-page":"10717","article-title":"Accelerate CNNs from three dimensions: A comprehensive pruning framework","volume-title":"Proc. ICML","author":"Wang"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.829"},{"key":"ref99","article-title":"Learning N: M fine-grained structured sparse neural networks from scratch","author":"Zhou","year":"2021","journal-title":"arXiv:2102.04010"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2022.3197282"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01779"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02170"},{"key":"ref103","article-title":"A survey on deep neural network pruning-taxonomy, comparison, analysis, and recommendations","author":"Cheng","year":"2023","journal-title":"arXiv:2308.06767"},{"key":"ref104","article-title":"Slimmable neural networks","volume-title":"Proc. ICLR","author":"Yu"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00189"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00850"},{"key":"ref107","article-title":"Once-for-all: Train one network and specialize it for efficient deployment","volume-title":"Proc. ICLR","author":"Cai"},{"key":"ref108","article-title":"Model distillation with knowledge transfer from face classification to alignment and verification","author":"Wang","year":"2017","journal-title":"arXiv:1709.02929"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-73603-7_40"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00501"},{"key":"ref111","first-page":"13292","article-title":"Learning student-friendly teacher networks for knowledge distillation","volume-title":"Proc. NIPS","author":"Park"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1139"},{"key":"ref113","article-title":"Explaining sequence-level knowledge distillation as data-augmentation for neural machine translation","author":"Gordon","year":"2019","journal-title":"arXiv:1912.03334"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107722"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-1190"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054698"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.11.025"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/TIE.2021.3057030"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.04.139"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1503.02531"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/158"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01165"},{"key":"ref123","first-page":"742","article-title":"Learning efficient object detection models with knowledge distillation","volume-title":"Proc. NIPS","author":"Chen"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/AIAM48774.2019.00106"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054157"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00035"},{"key":"ref127","article-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer","volume-title":"Proc. ICLR","author":"Zagoruyko"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.754"},{"key":"ref129","first-page":"2765","article-title":"Paraphrasing complex network: Network compression via factor transfer","volume-title":"Proc. NIPS","author":"Kim"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00938"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00816"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01064"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01050"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00358"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i9.16969"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_17"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00409"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00726"},{"key":"ref139","first-page":"33716","article-title":"Knowledge distillation from a stronger teacher","volume-title":"Proc. NIPS","author":"Huang"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01389"},{"key":"ref141","article-title":"Contrastive representation distillation","volume-title":"Proc. ICLR","author":"Tian"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00914"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00511"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20870-7_7"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.593"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00719"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_48"},{"key":"ref149","article-title":"Visibility guided NMS: Efficient boosting of amodal object detection in crowded traffic scenes","author":"G\u00e4hlert","year":"2020","journal-title":"arXiv:2006.08547"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00300"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00662"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412930"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-45886-1_16"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.685"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00378"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-16865-4_19"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1093\/comjnl\/bxaa108"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.23919\/DATE54114.2022.9774717"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00958"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01558"},{"key":"ref161","first-page":"1302","article-title":"Efficient softmax approximation for gpus","volume-title":"Proc. ICML","author":"Joulin"},{"key":"ref162","article-title":"SVD-Softmax: Fast softmax approximation on large vocabulary neural networks","volume-title":"Proc. NIPS","author":"Shim"},{"key":"ref163","first-page":"589","article-title":"Adaptive sampled softmax with kernel based sampling","volume-title":"Proc. ICML","author":"Blanc"},{"key":"ref164","first-page":"21297","article-title":"SOFT: Softmax-free transformer with linear complexity","volume-title":"Proc. NIPS","author":"Lu"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.14778\/3485450.3485451"},{"key":"ref166","article-title":"Learning to screen for fast softmax inference on large vocabulary neural networks","author":"Chen","year":"2018","journal-title":"arXiv:1810.12406"},{"key":"ref167","article-title":"Doubly sparse: Sparse mixture of sparse experts for efficient softmax inference","author":"Liao","year":"2019","journal-title":"arXiv:1901.10668"},{"key":"ref168","first-page":"13834","article-title":"Sampled softmax with random Fourier features","volume-title":"Proc. NIPS","author":"Rawat"},{"key":"ref169","first-page":"1783","article-title":"On the accuracy of self-normalized log-linear models","volume-title":"Proc. NIPS","author":"Andreas"},{"key":"ref170","first-page":"1106","article-title":"ImageNet classification with deep convolutional neural networks","volume-title":"Proc. NIPS","author":"Krizhevsky"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.123"},{"key":"ref172","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","volume-title":"Proc. ICML","author":"Tan"},{"key":"ref173","article-title":"ProxylessNAS: Direct neural architecture search on target task and hardware","author":"Cai","year":"2018","journal-title":"arXiv:1812.00332"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00293"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01099"},{"key":"ref176","article-title":"DARTS: Differentiable architecture search","volume-title":"Proc. ICLR","author":"Liu"},{"key":"ref177","first-page":"4095","article-title":"Efficient neural architecture search via parameters sharing","volume-title":"Proc. ICML","author":"Pham"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01556"},{"key":"ref179","article-title":"MCUNetV2: Memory-efficient patch-based inference for tiny deep learning","author":"Lin","year":"2021","journal-title":"arXiv:2110.15352"},{"key":"ref180","first-page":"11285","article-title":"TinyTL: Reduce memory, not parameters for efficient on-device learning","volume-title":"Proc. NIPS","author":"Cai"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1145\/2644865.2541967"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.58"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1145\/2786763.2694358"},{"key":"ref184","volume-title":"NVIDIA Deep Learning Accelerator","year":"2018"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1145\/3296957.3173176"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1007\/BF02165411"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611970364"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/MSPEC.1967.5217220"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-11179-7_36"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.435"},{"key":"ref191","volume-title":"Fast object detection using MLP and FFT","author":"Ben-Yacoub","year":"1997"},{"key":"ref192","article-title":"Fast training of convolutional networks through FFTs","author":"Mathieu","year":"2013","journal-title":"arXiv:1312.5851"},{"key":"ref193","article-title":"Fast convolutional nets with fbfft: A GPU performance evaluation","author":"Vasilache","year":"2014","journal-title":"arXiv:1412.7580"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2017.64"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/SOCC49529.2020.9524802"},{"key":"ref196","first-page":"328","article-title":"A3: Accelerating attention mechanisms in neural networks with approximation","volume-title":"Proc. IEEE Int. Symp. High Perform. Comput. Archit. (HPCA)","author":"Ham"},{"key":"ref197","article-title":"Vis-TOP: Visual transformer overlay processor","author":"Hu","year":"2021","journal-title":"arXiv:2110.10957"},{"key":"ref198","article-title":"VAQF: Fully automatic software-hardware co-design framework for low-bit vision transformer","author":"Sun","year":"2022","journal-title":"arXiv:2201.06618"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530505"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN54540.2023.10191521"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS46773.2023.10181988"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.23919\/VLSIC.2017.8008534"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2019.2936192"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2019.8662302"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS45731.2020.9180844"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIC.2018.8502276"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/FPL.2018.00035"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42613.2021.9365791"},{"key":"ref209","article-title":"Dimensionality compression and expansion in deep neural networks","author":"Recanatesi","year":"2019","journal-title":"arXiv:1906.00443"},{"key":"ref210","article-title":"Scalable model compression by entropy penalized reparameterization","author":"Oktay","year":"2019","journal-title":"arXiv:1906.06624"},{"key":"ref211","article-title":"DeepCABAC: Context-adaptive binary arithmetic coding for deep neural network compression","author":"Wiedemann","year":"2019","journal-title":"arXiv:1905.08318"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_28"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586173"},{"key":"ref214","volume-title":"Introduction to Algorithms","author":"Cormen","year":"2009"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1117\/1.1469618"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1145\/214762.214771"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.30"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2021.3098483"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783723"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4615-8675-3_4"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2015.27"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358275"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIC.2018.8502404"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.23919\/VLSIC.2019.8778193"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2911674"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM.2019.00013"},{"issue":"5","key":"ref230","first-page":"1","article-title":"CompAct: On-chip compression of activations for low power systolic array based CNN acceleration","volume":"18","author":"Zhang","year":"2019","journal-title":"ACM Trans. Embedded Comput. Syst."},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00011"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/ASPDAC.2017.7858395"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1145\/3020078.3021745"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2017.7870353"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471831"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1109\/TCSII.2019.2893527"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1109\/SOCC.2016.7905501"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/ICASID.2018.8693206"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/APCCAS.2018.8605654"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/ICSICT.2018.8565706"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/ICDSP.2018.8631588"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1145\/3299874.3317988"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.3390\/technologies8030046"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/AICAS.2019.8771616"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-021-94691-7"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/TCSII.2021.3120495"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586134"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4899-5841-9_2"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3009047"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2019.2963678"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3166799"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1109\/JETCAS.2022.3231708"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-48910-X_16"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1145\/357980.358017"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.1145\/1536414.1536440"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2019.8916519"},{"issue":"3","key":"ref257","first-page":"1","article-title":"Fully homomorphic encryption without bootstrapping","volume":"6","author":"Yagisawa","year":"2015","journal-title":"Cryptol. ePrint Arch."},{"key":"ref258","first-page":"144","article-title":"Somewhat practical fully homomorphic encryption","author":"Fan","year":"2012","journal-title":"Cryptol. ePrint Arch."},{"key":"ref259","article-title":"REED: Chiplet-based accelerator for fully homomorphic encryption","author":"Aikata","year":"2023","journal-title":"arXiv:2308.02885"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683721"},{"key":"ref261","first-page":"10035","article-title":"She: A fast and accurate deep neural network for encrypted data","volume-title":"Proc. NIPS","author":"Lou"},{"key":"ref262","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-96878-0_17"},{"key":"ref263","first-page":"4497","article-title":"TAPAS: Tricks to accelerate (encrypted) prediction as a service","volume-title":"Proc. ICML","author":"Sanyal"},{"key":"ref264","article-title":"Faster CryptoNets: Leveraging sparsity for real-world encrypted inference","author":"Chou","year":"2018","journal-title":"arXiv:1811.09953"},{"key":"ref265","article-title":"FFConv: Fast factorized convolutional neural network inference on encrypted data","author":"Lu","year":"2021","journal-title":"arXiv:2102.03494"},{"key":"ref266","doi-asserted-by":"publisher","DOI":"10.1109\/ISMS.2010.89"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1109\/TETC.2016.2619669"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1145\/3385412.3385996"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589053"},{"key":"ref270","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527415"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2014.2345388"},{"key":"ref272","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00013"},{"key":"ref273","doi-asserted-by":"publisher","DOI":"10.1007\/s00145-019-09319-x"},{"key":"ref274","first-page":"409","article-title":"Research on homomorphic encryption for arithmetic of approximate numbers","volume-title":"Proc. Int. Conf. Intell. Syst. Commun., IoT Secur. (ICISCoIS)","author":"Cheon"},{"key":"ref275","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NIPS","author":"Brown"},{"key":"ref276","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref277","first-page":"27168","article-title":"ZeroQuant: Efficient and affordable post-training quantization for large-scale transformers","volume-title":"Proc. NIPS","author":"Yao"},{"key":"ref278","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i4.20387"},{"key":"ref279","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.2976762"},{"key":"ref280","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-022-01736-y"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5962385\/10949581\/10557780.pdf?arnumber=10557780","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T06:34:44Z","timestamp":1743834884000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10557780\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4]]},"references-count":280,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2024.3394494","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4]]}}}