{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T13:17:21Z","timestamp":1768569441257,"version":"3.49.0"},"reference-count":55,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376129"],"award-info":[{"award-number":["62376129"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFF0725003"],"award-info":[{"award-number":["2023YFF0725003"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010041","name":"Tianjin Science and Technology Major Project","doi-asserted-by":"publisher","award":["24ZXZSSS00420"],"award-info":[{"award-number":["24ZXZSSS00420"]}],"id":[{"id":"10.13039\/501100010041","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100012543","name":"Tiankai Higher Education Science and Technology Park Enterprise Research and Development Special Project","doi-asserted-by":"publisher","award":["23YFZXYC00029"],"award-info":[{"award-number":["23YFZXYC00029"]}],"id":[{"id":"10.13039\/100012543","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1109\/tnnls.2025.3548047","type":"journal-article","created":{"date-parts":[[2025,3,21]],"date-time":"2025-03-21T20:41:51Z","timestamp":1742589711000},"page":"15294-15308","source":"Crossref","is-referenced-by-count":3,"title":["2-D Transformer: Extending Large Language Models to Long-Context With Few Memory"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-1202-7784","authenticated-orcid":false,"given":"Xingyang","family":"He","sequence":"first","affiliation":[{"name":"College of Artificial Intelligence, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5544-8417","authenticated-orcid":false,"given":"Jie","family":"Liu","sequence":"additional","affiliation":[{"name":"National Key Laboratory of Intelligent Tracking and Forecasting for Infectious Diseases, Engineering Research Center of Trusted Behavior Intelligence, Ministry of Education, College of Artificial Intelligence, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yutai","family":"Duan","sequence":"additional","affiliation":[{"name":"College of Artificial Intelligence, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"LongBench: A bilingual, multitask benchmark for long context understanding","author":"Bai","year":"2023","journal-title":"arXiv:2308.14508"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1102"},{"key":"ref3","article-title":"Retrieval-augmented generation for large language models: A survey","author":"Gao","year":"2023","journal-title":"arXiv:2312.10997"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.260"},{"key":"ref5","article-title":"Extending context window of large language models via positional interpolation","author":"Chen","year":"2023","journal-title":"arXiv:2306.15595"},{"key":"ref6","first-page":"1","article-title":"Focused transformer: Contrastive training for context scaling","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Tworkowski"},{"key":"ref7","article-title":"Long context compression with activation beacon","author":"Zhang","year":"2024","journal-title":"arXiv:2401.03462"},{"key":"ref8","article-title":"FocusLLM: Precise understanding of long context by dynamic condensing","author":"Li","year":"2024","journal-title":"arXiv:2408.11745"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.222"},{"key":"ref10","article-title":"LongNet: Scaling transformers to 1,000,000,000 tokens","author":"Ding","year":"2023","journal-title":"arXiv:2307.02486"},{"key":"ref11","article-title":"LongLoRA: Efficient fine-tuning of long-context large language models","author":"Chen","year":"2023","journal-title":"arXiv:2309.12307"},{"key":"ref12","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref13","article-title":"FlashAttention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2023","journal-title":"arXiv:2307.08691"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.617"},{"key":"ref15","volume-title":"RedPajama: An Open Dataset for Training Large Language Models","author":"Computer","year":"2023"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref17","article-title":"Longformer: The long-document transformer","author":"Beltagy","year":"2020","journal-title":"arXiv:2004.05150"},{"key":"ref18","first-page":"17283","article-title":"Big Bird: Transformers for longer sequences","volume-title":"Proc. 34th Conf. Neural Inf. Process. Syst. (NeurlPS)","volume":"33","author":"Zaheer"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.55"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.232"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.309"},{"key":"ref22","article-title":"Rethinking attention with performers","author":"Choromanski","year":"2020","journal-title":"arXiv:2009.14794"},{"key":"ref23","article-title":"Linformer: Self-attention with linear complexity","author":"Wang","year":"2020","journal-title":"arXiv:2006.04768"},{"key":"ref24","first-page":"2206","article-title":"Improving language models by retrieving from trillions of tokens","volume-title":"Proc. 39th Int. Conf. Mach. Learn.","volume":"162","author":"Borgeaud"},{"key":"ref25","article-title":"Memorizing transformers","author":"Wu","year":"2022","journal-title":"arXiv:2203.08913"},{"key":"ref26","article-title":"Landmark attention: Random-access infinite context length for transformers","author":"Mohtashami","year":"2023","journal-title":"arXiv:2305.16300"},{"key":"ref27","article-title":"HyPE-GT: Where graph transformers meet hyperbolic positional encodings","author":"Bose","year":"2023","journal-title":"arXiv:2312.06576"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2024.3408835"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3335119"},{"key":"ref30","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NIPS","volume":"33","author":"Brown"},{"key":"ref31","first-page":"1","article-title":"Efficient large-scale language model training on GPU clusters using megatron-LM","volume-title":"Proc. Int. Conf. High Perform. Comput., Netw., Storage Anal.","author":"Narayanan"},{"key":"ref32","article-title":"P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks","author":"Liu","year":"2021","journal-title":"arXiv:2110.07602"},{"key":"ref33","article-title":"Prefix-tuning: Optimizing continuous prompts for generation","author":"Lisa Li","year":"2021","journal-title":"arXiv:2101.00190"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.64"},{"key":"ref35","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv:2106.09685"},{"key":"ref36","first-page":"1950","article-title":"Few-shot parameter-efficient fine-tuning is better and cheaper than in-context learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Liu"},{"key":"ref37","article-title":"BitFit: Simple parameter-efficient fine-tuning for transformer-based masked language-models","author":"Zaken","year":"2021","journal-title":"arXiv:2106.10199"},{"key":"ref38","article-title":"Input-tuning: Adapting unfamiliar inputs to frozen pretrained models","author":"An","year":"2022","journal-title":"arXiv:2203.03131"},{"key":"ref39","first-page":"24193","article-title":"Training neural networks with fixed sparse masks","volume-title":"Proc. Neural Inf. Process. Syst. (NIPS)","volume":"34","author":"Sung"},{"key":"ref40","article-title":"Compressive transformers for long-range sequence modelling","author":"Rae","year":"2019","journal-title":"arXiv:1911.05507"},{"key":"ref41","article-title":"Proofpile: A pre-training dataset of mathematical texts","author":"Azerbayev","year":"2023"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.264"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.74"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.112"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.472"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.3115\/1072228.1072378"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1147"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-5409"},{"key":"ref49","article-title":"TruthfulQA: Measuring how models mimic human falsehoods","author":"Lin","year":"2021","journal-title":"arXiv:2109.07958"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1472"},{"key":"ref51","article-title":"Efficient streaming language models with attention sinks","author":"Xiao","year":"2023","journal-title":"arXiv:2309.17453"},{"key":"ref52","article-title":"InfLLM: Training-free long-context extrapolation for LLMs with an efficient context memory","author":"Xiao","year":"2024","journal-title":"arXiv:2402.04617"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.417"},{"key":"ref54","first-page":"1","article-title":"How long can context length of open-source LLMs truly promise?","volume-title":"Proc. NeurIPS Workshop","author":"Li"},{"key":"ref55","article-title":"Generating long sequences with sparse transformers","author":"Child","year":"2019","journal-title":"arXiv:1904.10509"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5962385\/11114436\/10937248.pdf?arnumber=10937248","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,6]],"date-time":"2025-08-06T18:01:15Z","timestamp":1754503275000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10937248\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8]]},"references-count":55,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2025.3548047","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,8]]}}}