{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T20:05:07Z","timestamp":1780949107657,"version":"3.54.1"},"reference-count":28,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","award":["CoCoSys, one of seven centers in JUMP 2.0, a Semiconductor Research Corporation (SRC)"],"award-info":[{"award-number":["CoCoSys, one of seven centers in JUMP 2.0, a Semiconductor Research Corporation (SRC)"]}],"id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Artif. Intel."],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1109\/tcasai.2025.3628250","type":"journal-article","created":{"date-parts":[[2025,11,3]],"date-time":"2025-11-03T18:44:00Z","timestamp":1762195440000},"page":"90-100","source":"Crossref","is-referenced-by-count":0,"title":["Softprox: A Post-Finetuning Methodology to Mitigate Softmax Bottlenecks in Transformer Workloads"],"prefix":"10.1109","volume":"3","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6961-3540","authenticated-orcid":false,"given":"Aradhana Mohan","family":"Parvathy","sequence":"first","affiliation":[{"name":"Purdue University, West Lafayette, IN, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6864-7738","authenticated-orcid":false,"given":"Sourjya","family":"Roy","sequence":"additional","affiliation":[{"name":"Purdue University, West Lafayette, IN, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6776-1427","authenticated-orcid":false,"given":"Soumendu Kumar","family":"Ghosh","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8848-1069","authenticated-orcid":false,"given":"Arnab","family":"Raha","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3768-1337","authenticated-orcid":false,"given":"Deepak","family":"Mathaikutty","sequence":"additional","affiliation":[{"name":"Intel Corporation, Santa Clara, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4624-564X","authenticated-orcid":false,"given":"Anand","family":"Raghunathan","sequence":"additional","affiliation":[{"name":"Purdue University, West Lafayette, IN, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref3","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Radford","year":"2023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref5","article-title":"Peering inside GPT-4: Understanding its mixture of experts (MOE) architecture"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01179"},{"key":"ref8","article-title":"Gemini 2.5: Our most intelligent ai model"},{"key":"ref9","article-title":"Efficient attention mechanisms for large language models: A survey","author":"Sun","year":"2025"},{"key":"ref10","article-title":"A survey of efficient attention methods: Hardware-efficient, sparse, compact, and linear attention","author":"Zhang"},{"key":"ref11","article-title":"Quick overview of intel\u2019s neural processing unit (NPU)"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/COINS65080.2025.11125797"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1189"},{"key":"ref14","first-page":"1","article-title":"Model tells you what to discard: Adaptive KV cache compression for LLMs","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Ge","year":"2024"},{"key":"ref15","article-title":"Linformer: Self-attention with linear complexity","author":"Wang","year":"2020"},{"key":"ref16","first-page":"21297","article-title":"SOFT: Softmax-free transformer with linear complexity","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 34: Annu. Conf. Neural Inf. Process. Syst. (NeurIPS)","author":"Lu","year":"2021"},{"key":"ref17","first-page":"1","article-title":"MA-BERT: Towards matrix arithmetic-only BERT inference by eliminating complex non-linear functions","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Wei Ming","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586134"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00259"},{"key":"ref20","doi-asserted-by":"crossref","first-page":"577","DOI":"10.1145\/3489517.3530505","article-title":"NN-LUT: Neural approximation of non-linear operations for efficient transformer inference","volume-title":"Proc. 59th ACM\/IEEE Des. Autom. Conf. (DAC)","author":"Yu","year":"2022"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.3389\/fhpcp.2025.1570210"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/w18-5446"},{"key":"ref23","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. Adv. Neural Information Process. Syst.","author":"Paszke","year":"2019"},{"key":"ref24","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019"},{"key":"ref25","article-title":"The Llama 3 herd of models","year":"2024"},{"key":"ref27","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. 38th Int. Conf. Mach. Learn., (ICML)","author":"Touvron","year":"2021"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref31","article-title":"Openvino"}],"container-title":["IEEE Transactions on Circuits and Systems for Artificial Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10495160\/11458634\/11224505.pdf?arnumber=11224505","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T19:50:38Z","timestamp":1780948238000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11224505\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":28,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tcasai.2025.3628250","relation":{},"ISSN":["2996-6647"],"issn-type":[{"value":"2996-6647","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4]]}}}