{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T14:45:28Z","timestamp":1781621128092,"version":"3.54.5"},"reference-count":83,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"NSF DMS","award":["2304489"],"award-info":[{"award-number":["2304489"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Signal Process."],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1109\/jstsp.2024.3446173","type":"journal-article","created":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T18:29:11Z","timestamp":1724351351000},"page":"985-996","source":"Crossref","is-referenced-by-count":1,"title":["S$^\\text{3}$Attention: Improving Long Sequence Attention With Smoothed Skeleton Sketching"],"prefix":"10.1109","volume":"18","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2296-9688","authenticated-orcid":false,"given":"Xue","family":"Wang","sequence":"first","affiliation":[{"name":"Alibaba Group, Bellevue, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1789-5413","authenticated-orcid":false,"given":"Tian","family":"Zhou","sequence":"additional","affiliation":[{"name":"Alibaba Group, Bellevue, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4563-6785","authenticated-orcid":false,"given":"Jianqing","family":"Zhu","sequence":"additional","affiliation":[{"name":"Computer, Electrical and Mathematical Science and Engineering Division, King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jialin","family":"Liu","sequence":"additional","affiliation":[{"name":"Department of Statistics and Data Science, University of Central Florida, Orlando, FL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kun","family":"Yuan","sequence":"additional","affiliation":[{"name":"Center for Machine Learning Research, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tao","family":"Yao","sequence":"additional","affiliation":[{"name":"Antai College of Economics and Management, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wotao","family":"Yin","sequence":"additional","affiliation":[{"name":"Alibaba Group, Bellevue, WA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8797-4646","authenticated-orcid":false,"given":"Rong","family":"Jin","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2937-1986","authenticated-orcid":false,"given":"HanQin","family":"Cai","sequence":"additional","affiliation":[{"name":"Department of Statistics and Data Science and the Department of Computer Science, University of Central Florida, Orlando, FL, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref4","article-title":"Roberta: A robustly optimized bert pretraining approach","author":"Liu","year":"2019"},{"key":"ref5","article-title":"Electra: Pre-training text encoders as discriminators rather than generators","author":"Clark","year":"2020"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref8","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Touvron","year":"2021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3206108"},{"key":"ref10","article-title":"Elsa: Enhanced local self-attention for vision transformer","author":"Zhou","year":"2021"},{"key":"ref11","first-page":"22419","article-title":"Autoformer: Decomposition transformers with auto-correlation for long-term series forecasting","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Xu","year":"2021"},{"key":"ref12","first-page":"27268","article-title":"FEDformer: Frequency enhanced decomposed transformer for long-term series forecasting","volume-title":"Proc. 39th Int. Conf. Mach. Learn.","author":"Zhou","year":"2022"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3437963.3441740"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3449983"},{"key":"ref15","first-page":"7483","article-title":"Forecasting treatment responses over time using recurrent marginal structural networks","volume-title":"Proc. Neural Inf. Process. Syst.","volume":"18","author":"Lim"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1017\/S0950268818000705"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.14778\/3137765.3137775"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3366424.3382728"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-019-0212-5"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3366424.3383118"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3512056"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380288"},{"key":"ref23","first-page":"17723","article-title":"Long-short transformer: Efficient transformers for language and vision","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Zhu","year":"2021"},{"key":"ref24","article-title":"Rethinking attention with performers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Choromanski","year":"2020"},{"key":"ref25","first-page":"17413","article-title":"Scatterbrain: Unifying sparse and low-rank attention","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Chen","year":"2021"},{"key":"ref26","article-title":"On learning the transformer kernel","author":"Chowdhury","year":"2021"},{"key":"ref27","article-title":"Cosformer: Rethinking softmax in attention","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Qin","year":"2021"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.294"},{"key":"ref29","first-page":"29449","article-title":"Fmmformer: Efficient and flexible transformer via decomposed near-field and far-field attention","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Nguyen","year":"2021"},{"key":"ref30","first-page":"2441","article-title":"Luna: Linear unified nested attention","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Ma","year":"2021"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1137\/07070471X"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1137\/110852310"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2020.3044130"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1137\/20M1388322"},{"issue":"185","key":"ref35","first-page":"1","article-title":"Mode-wise tensor decompositions: Multi-dimensional generalizations of CUR decompositions","volume":"22","author":"Cai","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref36","first-page":"152","article-title":"Riemannian CUR decompositions for robust principal component analysis","volume-title":"Proc. Topological, Algebr. Geometric Learn. Workshops","author":"Hamm","year":"2022"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3261185"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1137\/23M1574282"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CISS59072.2024.10480165"},{"key":"ref40","article-title":"Long range arena : A benchmark for efficient transformers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tay","year":"2021"},{"key":"ref41","article-title":"Fast training of convolutional networks through FFTS: International conference on learning representations (iclr2014), CBLS","volume-title":"Proc. 2nd Int. Conf. Learn. Representations","author":"Mathieu","year":"2014"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3530811"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.232"},{"key":"ref44","first-page":"4055","article-title":"Image transformer","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Parmar","year":"2018"},{"key":"ref45","article-title":"Generating wikipedia by summarizing long sequences","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Liu","year":"2018"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1285"},{"key":"ref47","article-title":"Compressive transformers for long-range sequence modelling","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Rae","year":"2020"},{"key":"ref48","first-page":"9438","article-title":"Sparse sinkhorn attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tay","year":"2020"},{"key":"ref49","article-title":"Longformer: The long-document transformer","author":"Beltagy","year":"2020"},{"key":"ref50","article-title":"Generating long sequences with sparse transformers","author":"Child","year":"2019"},{"key":"ref51","article-title":"Reformer: The efficient transformer","volume-title":"Proc. 8th Int. Conf. Learn. Representations","author":"Kitaev","year":"2020"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00353"},{"key":"ref53","first-page":"17283","article-title":"Big bird: Transformers for longer sequences","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Zaheer","year":"2020"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.319"},{"key":"ref55","article-title":"PoNet: Pooling network for efficient token mixing in long sequences","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tan"},{"key":"ref56","article-title":"Linformer: Self-attention with linear complexity","author":"Wang"},{"key":"ref57","article-title":"Random feature attention","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Peng","year":"2020"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053878"},{"key":"ref59","first-page":"10183","article-title":"Synthesizer: Rethinking self-attention for transformer models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tay","year":"2021"},{"key":"ref60","first-page":"14138","article-title":"Nystrmformer: A nystrm-based algorithm for approximating self-attention","volume-title":"Proc. AAAI Conf. Artif. Intell.","volume":"35","author":"Xiong","year":"2021"},{"key":"ref61","first-page":"5156","article-title":"Transformers are rnns: Fast autoregressive transformers with linear attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Katharopoulos"},{"key":"ref62","article-title":"Perceiver IO: A general architecture for structured inputs & outputs","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Jaegle","year":"2021"},{"key":"ref63","first-page":"4271","article-title":"Funnel-transformer: Filtering out sequential redundancy for efficient language processing","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","volume":"33","author":"Dai","year":"2020"},{"key":"ref64","first-page":"20014","article-title":"Xcit: Cross-covariance image transformers","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","volume":"34","author":"Ali","year":"2021"},{"key":"ref65","article-title":"Charformer: Fast character transformers via gradient-based subword tokenization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Tay","year":"2021"},{"key":"ref66","first-page":"1474","article-title":"Hippo: Recurrent memory with optimal polynomial projections","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Gu","year":"2020"},{"key":"ref67","article-title":"Efficiently modeling long sequences with structured state spaces","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gu","year":"2022"},{"key":"ref68","first-page":"22982","article-title":"Diagonal state spaces are as effective as structured state spaces","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gupta","year":"2022"},{"key":"ref69","first-page":"572","article-title":"Combining recurrent, convolutional, and continuous-time models with linear state space layers","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","volume":"34","author":"Gu","year":"2021"},{"key":"ref70","first-page":"22982","article-title":"Simplified state space layers for sequence modeling","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Smith","year":"2023"},{"key":"ref71","article-title":"Liquid structural state-space models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hasani","year":"2023"},{"key":"ref72","article-title":"How to train your HIPPO: State space models with generalized orthogonal basis projections","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gu","year":"2023"},{"key":"ref73","article-title":"Mega: Moving average equipped gated attention","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ma","year":"2023"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1137\/19M128394X"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/s10208-009-9045-5"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0364-2"},{"key":"ref77","article-title":"Legendre memory units: Continuous-time representation in recurrent neural networks","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","volume":"32","author":"Voelker","year":"2019"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20150"},{"key":"ref79","first-page":"101","article-title":"Autoformer: Decomposition transformers with auto-correlation for long-term series forecasting","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wu","year":"2021"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17325"},{"key":"ref81","article-title":"Enhancing the locality and breaking the memory bottleneck of transformer on time series forecasting","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","volume":"32","author":"Li","year":"2019"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1145\/1132516.1132597"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.5"}],"container-title":["IEEE Journal of Selected Topics in Signal Processing"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/4200690\/10852353\/10643652-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/4200690\/10852353\/10643652.pdf?arnumber=10643652","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T18:40:36Z","timestamp":1738867236000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10643652\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9]]},"references-count":83,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2024.3446173","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9]]}}}