{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T14:44:48Z","timestamp":1775745888975,"version":"3.50.1"},"reference-count":49,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102249"],"award-info":[{"award-number":["62102249"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62232015"],"award-info":[{"award-number":["62232015"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Comput.-Aided Des. Integr. Circuits Syst."],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1109\/tcad.2024.3373592","type":"journal-article","created":{"date-parts":[[2024,3,5]],"date-time":"2024-03-05T19:09:53Z","timestamp":1709665793000},"page":"2783-2796","source":"Crossref","is-referenced-by-count":18,"title":["Hardware\u2013Software Co-Design Enabling Static and Dynamic Sparse Attention Mechanisms"],"prefix":"10.1109","volume":"43","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8211-2812","authenticated-orcid":false,"given":"Jieru","family":"Zhao","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2319-7859","authenticated-orcid":false,"given":"Pai","family":"Zeng","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1264-8715","authenticated-orcid":false,"given":"Guan","family":"Shen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5832-0347","authenticated-orcid":false,"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-2302","authenticated-orcid":false,"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref3","article-title":"HuggingFace\u2019s transformers: State-of-the-art natural language processing","author":"Wolf","year":"2019","journal-title":"arXiv:1910.03771"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref6","first-page":"4055","article-title":"Image transformer","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Parmar"},{"key":"ref7","article-title":"Voice transformer network: Sequence-to-sequence voice conversion using transformer with text-to-speech pretraining","author":"Huang","year":"2019","journal-title":"arXiv:1912.06813"},{"key":"ref8","first-page":"9361","article-title":"SqueezeFormer: An efficient transformer for automatic speech recognition","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Kim"},{"key":"ref9","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"key":"ref10","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown"},{"key":"ref11","article-title":"Gpt-4 technical report","volume-title":"arXiv:2303.08774","year":"2023"},{"key":"ref12","article-title":"GPTEval: NLG evaluation using GPT-4 with better human alignment","author":"Liu","year":"2023","journal-title":"arXiv:2303.16634"},{"key":"ref13","article-title":"Evaluating the logical reasoning ability of chatGPT and GPT-4","author":"Liu","year":"2023","journal-title":"arXiv:2304.03439"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17533"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00018"},{"key":"ref16","article-title":"FlashAttention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2023","journal-title":"arXiv:2307.08691"},{"key":"ref17","article-title":"Generating long sequences with sparse transformers","author":"Child","year":"2019","journal-title":"arXiv:1904.10509"},{"key":"ref18","article-title":"Blockwise self-attention for long document understanding","author":"Qiu","year":"2019","journal-title":"arXiv:1911.02972"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1133"},{"key":"ref20","first-page":"17283","article-title":"Big bird: Transformers for longer sequences","volume-title":"Proc. 34th Adv. Neural Inf. Process. Syst.","volume":"33","author":"Zaheer"},{"key":"ref21","article-title":"Longformer: The long-document transformer","author":"Beltagy","year":"2020","journal-title":"arXiv:2004.05150"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00299"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01181"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1223"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1361"},{"key":"ref26","article-title":"Reformer: The efficient transformer","author":"Kitaev","year":"2020","journal-title":"arXiv:2001.04451"},{"key":"ref27","first-page":"9438","article-title":"Sparse sinkhorn attention","volume-title":"Proc. ICML","author":"Tay"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00353"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2022.3208206"},{"key":"ref30","first-page":"328","article-title":"A3: Accelerating attention mechanisms in neural networks with approximation","volume-title":"Proc. IEEE Int. Symp. High Perform. Comput. Archit. (HPCA)","author":"Ham"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2023.3273992"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3370748.3406567"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071027"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530504"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589057"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2022.3170848"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.23919\/DATE54114.2022.9774692"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507738"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00050"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586134"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/1468075.1468121"},{"key":"ref43","volume-title":"NVIDIA\/DeepLearningExamples.","year":"2021"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"ref46","article-title":"Large-scale cloze test dataset created by teachers","author":"Xie","year":"2017","journal-title":"arXiv:1711.03225"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/2228360.2228584"},{"key":"ref48","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. 33rd Adv. Neural Inf. Process. Syst.","volume":"32","author":"Paszke"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/EMC2-NIPS53020.2019.00010"}],"container-title":["IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/43\/10643378\/10460307.pdf?arnumber=10460307","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T03:59:49Z","timestamp":1725163189000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10460307\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9]]},"references-count":49,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tcad.2024.3373592","relation":{},"ISSN":["0278-0070","1937-4151"],"issn-type":[{"value":"0278-0070","type":"print"},{"value":"1937-4151","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9]]}}}