{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:28:20Z","timestamp":1780709300462,"version":"3.54.1"},"reference-count":97,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2025,5,1]],"date-time":"2025-05-01T00:00:00Z","timestamp":1746057600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,5,1]],"date-time":"2025-05-01T00:00:00Z","timestamp":1746057600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,1]],"date-time":"2025-05-01T00:00:00Z","timestamp":1746057600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1109\/tpami.2025.3538221","type":"journal-article","created":{"date-parts":[[2025,2,3]],"date-time":"2025-02-03T18:26:46Z","timestamp":1738607206000},"page":"4061-4074","source":"Crossref","is-referenced-by-count":2,"title":["Pre-Training a Graph Recurrent Network for Text Understanding"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8705-9598","authenticated-orcid":false,"given":"Yile","family":"Wang","sequence":"first","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Linyi","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Engineering, Westlake University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6757-6897","authenticated-orcid":false,"given":"Zhiyang","family":"Teng","sequence":"additional","affiliation":[{"name":"ByteDance, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2551-2964","authenticated-orcid":false,"given":"Ming","family":"Zhou","sequence":"additional","affiliation":[{"name":"Langboat Ltd., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5214-2268","authenticated-orcid":false,"given":"Yue","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Engineering, Westlake University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"key":"ref2","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Brown"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref5","article-title":"RoBERTa: A robustly optimized BERT pretraining approach","author":"Liu","year":"2019"},{"key":"ref6","first-page":"5754","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref7","article-title":"ALBERT: A lite BERT for self-supervised learning of language representations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lan"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1525\/9780520940420-020"},{"key":"ref9","first-page":"140:1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref10","article-title":"DeBERTa: Decoding-enhanced BERT with disentangled attention","volume-title":"Proc. Int. Conf. Learn. Representations","author":"He"},{"key":"ref11","article-title":"ELECTRA: Pre-training text encoders as discriminators rather than generators","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Clark"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"ref13","first-page":"3261","article-title":"Superglue: A stickier benchmark for general-purpose language understanding systems","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-2124"},{"key":"ref16","first-page":"1693","article-title":"Teaching machines to read and comprehend","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Hermann"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1206"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.9"},{"key":"ref19","article-title":"Lamda: Language models for dialog applications","author":"Thoppilan","year":"2022"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref22","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref23","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Touvron"},{"key":"ref24","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Baevski"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-021-03819-2"},{"key":"ref26","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref27","article-title":"LLaMA 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref28","article-title":"ChatGPT","year":"2022"},{"key":"ref29","article-title":"GPT-4 Technical Report","author":"Achiam","year":"2023"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1355"},{"key":"ref31","article-title":"Pay less attention with lightweight and dynamic convolutions","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wu"},{"key":"ref32","first-page":"2793","article-title":"Attention is not all you need: Pure attention loses rank doubly exponentially with depth","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Dong"},{"key":"ref33","first-page":"10183","article-title":"Synthesizer: Rethinking self-attention for transformer models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tay"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.335"},{"key":"ref35","article-title":"Pay Attention to MLPs","author":"Liu","year":"2021"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.936"},{"key":"ref37","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Gu","year":"2023"},{"key":"ref38","article-title":"Jamba: A hybrid transformer-Mamba language model","author":"Lieber","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref40","article-title":"Rethinking positional encoding in language pre-training","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ke"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3337643"},{"key":"ref42","first-page":"3837","article-title":"Convolutional neural networks on graphs with fast localized spectral filtering","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Defferrard"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1209"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1156"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.295"},{"key":"ref46","first-page":"1243","article-title":"Convolutional sequence to sequence learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Gehring"},{"key":"ref47","article-title":"Semi-supervised classification with graph convolutional networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kipf"},{"key":"ref48","article-title":"Graph attention networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Velickovic"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1030"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1150"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K16-1006"},{"key":"ref52","article-title":"An attention free transformer","author":"Zhai","year":"2021"},{"key":"ref53","article-title":"Efficiently modeling long sequences with structured state spaces","author":"Gu","year":"2021"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.08.002"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.319"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1159"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1110"},{"key":"ref59","article-title":"Reformer: The efficient transformer","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kitaev"},{"key":"ref60","article-title":"Longformer: The long-document transformer","author":"Beltagy","year":"2020"},{"key":"ref61","article-title":"LinFormer: self-attention with linear complexity","author":"Wang","year":"2020"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.2307\/j.ctv36zrf8.5"},{"key":"ref63","article-title":"Random feature attention","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Peng"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3530811"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref66","article-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-2012"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.831"},{"key":"ref69","article-title":"Universal transformers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dehghani"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"key":"ref71","article-title":"Pointer sentinel mixture models","author":"Merity","year":"2016"},{"key":"ref72","article-title":"DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter","author":"Sanh","year":"2019"},{"key":"ref73","first-page":"17283","article-title":"Big bird: Transformers for longer sequences","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zaheer"},{"key":"ref74","article-title":"Pre-training with whole word masking for Chinese BERT","author":"Cui","year":"2019"},{"key":"ref75","article-title":"CLUECorpus2020: A large-scale Chinese corpus for pre-training language model","author":"Xu","year":"2020"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.58"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.419"},{"key":"ref79","first-page":"1952","article-title":"LCQMC: A large-scale Chinese question matching corpus","volume-title":"Proc. Int. Conf. Comput. Linguistics","author":"Liu"},{"key":"ref80","first-page":"774","article-title":"Scalable term selection for text categorization","volume-title":"Proc. Conf. Empir. Methods Natural Lang. Process.-Comput. Natural Lang. Learn.","author":"Li"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2007.05.028"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.435"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17580"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1074"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.212"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d19-1006"},{"key":"ref88","article-title":"Isotropy in the contextual embedding space: Clusters and manifolds","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Cai"},{"key":"ref89","article-title":"Representation degeneration problem in training natural language generation models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gao"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.46"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17718"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.552"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-short.73"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4828"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1445"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.587"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.398"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10958761\/10870153.pdf?arnumber=10870153","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,14]],"date-time":"2025-04-14T18:19:06Z","timestamp":1744654746000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10870153\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5]]},"references-count":97,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3538221","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,5]]}}}