{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T19:54:37Z","timestamp":1774295677136,"version":"3.50.1"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T00:00:00Z","timestamp":1743379200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T00:00:00Z","timestamp":1743379200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,3,31]]},"DOI":"10.23919\/date64628.2025.10992862","type":"proceedings-article","created":{"date-parts":[[2025,5,21]],"date-time":"2025-05-21T17:36:35Z","timestamp":1747848995000},"page":"1-7","source":"Crossref","is-referenced-by-count":2,"title":["SoftmAP: Software-Hardware Co-Design for Integer-Only Softmax on Associative Processors"],"prefix":"10.23919","author":[{"given":"Mariam","family":"Rakka","sequence":"first","affiliation":[{"name":"University of California,Irvine,CA,USA"}]},{"given":"Jinhao","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}]},{"given":"Guohao","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,Shanghai,China"}]},{"given":"Ahmed","family":"Eltawil","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology (KAUST),Thuwal,Saudi Arabia"}]},{"given":"Mohammed E.","family":"Fouda","sequence":"additional","affiliation":[{"name":"Rain Neuromorphics Inc.,San Francisco,CA,USA"}]},{"given":"Fadi","family":"Kurdahi","sequence":"additional","affiliation":[{"name":"University of California,Irvine,CA,USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"A survey of large language models","author":"Zhao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref2","article-title":"Sparks of artificial general intelligence: Early experiments with gpt-4","author":"Bubeck","year":"2023","journal-title":"arXiv preprint"},{"key":"ref3","volume-title":"Parameter, compute and data trends in machine learning","author":"AI","year":"2024"},{"key":"ref4","article-title":"A survey of resource-efficient llm and multimodal foundation models","author":"Xu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020","journal-title":"arXiv preprint"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-industry.5"},{"key":"ref7","article-title":"Large language model inference acceleration: A comprehensive hardware perspective","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Model compression and efficient inference for large language models: A survey","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref9","first-page":"38087","article-title":"Smoothquant: Accurate and efficient post-training quantization for large language models","volume-title":"International Conference on Machine Learning","author":"Xiao","year":"2023"},{"key":"ref10","article-title":"Qlora: Efficient finetuning of quantized llms","volume":"36","author":"Dettmers","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref11","first-page":"27168","article-title":"Zeroquant: Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Yao","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref12","article-title":"Awq: Activation-aware weight quantization for llm compression and acceleration","author":"Lin","year":"2023","journal-title":"arXiv preprint"},{"key":"ref13","volume-title":"Fast and efficient 2-bit llm inference on gpu: 2\/4\/16-bit in a weight matrix with asynchronous dequantization","author":"Li","year":"2024"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00157"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586134"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3676536.3676766"},{"key":"ref17","article-title":"Nongemm bench: Understanding the performance horizon of the latest ml workloads with nongemm workloads","volume-title":"arXiv preprint","author":"Karami","year":"2024"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/2.330035"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TCSII.2022.3170468"},{"key":"ref20","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref21","article-title":"Opt: Open pretrained transformer language models","author":"Zhang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref22","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"issue":"240","key":"ref23","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref24","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Reid","year":"2024","journal-title":"arXiv preprint"},{"key":"ref25","volume-title":"Efficient acceleration of computation using associative in-memory processing","author":"Yantir","year":"2018"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2018.2827262"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-47721-7_24"},{"key":"ref28","first-page":"5506","article-title":"I-bert: Integer-only bert quantization","volume-title":"International conference on machine learning","author":"Kim","year":"2021"},{"key":"ref29","article-title":"Online normalizer calculation for softmax","author":"Milakov","year":"2018","journal-title":"arXiv preprint"},{"key":"ref30","article-title":"Pointer sentinel mixture models","author":"Merity","year":"2016","journal-title":"arXiv preprint"},{"key":"ref31","volume-title":"PyTorch"},{"key":"ref32","volume-title":"Transformers"},{"key":"ref33","volume-title":"HuggingFace","year":"2024"},{"key":"ref34","article-title":"Bf-imna: A bit fluid in-memory neural architecture for neural network acceleration","author":"Rakka","year":"2024","journal-title":"arXiv preprint"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP49362.2020.00017"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3299874.3317988"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/IWOFC48002.2019.9078446"},{"key":"ref38","article-title":"Svd-softmax: Fast softmax approximation on large vocabulary neural networks","volume":"30","author":"Shim","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"ref40","article-title":"Flashattention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2023","journal-title":"arXiv preprint"}],"event":{"name":"2025 Design, Automation &amp; Test in Europe Conference (DATE)","location":"Lyon, France","start":{"date-parts":[[2025,3,31]]},"end":{"date-parts":[[2025,4,2]]}},"container-title":["2025 Design, Automation &amp;amp; Test in Europe Conference (DATE)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10992638\/10992588\/10992862.pdf?arnumber=10992862","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T05:32:32Z","timestamp":1747891952000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10992862\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,31]]},"references-count":40,"URL":"https:\/\/doi.org\/10.23919\/date64628.2025.10992862","relation":{},"subject":[],"published":{"date-parts":[[2025,3,31]]}}}