{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T12:29:17Z","timestamp":1777984157713,"version":"3.51.4"},"reference-count":98,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&amp;D Program of China","award":["2022ZD0116002"],"award-info":[{"award-number":["2022ZD0116002"]}]},{"name":"Shenzhen Science and Technology Plan","award":["ShenKeJiChuangXinZhi[2023]87"],"award-info":[{"award-number":["ShenKeJiChuangXinZhi[2023]87"]}]},{"name":"Shenzhen Science and Technology Plan","award":["KJZD20230923115113026"],"award-info":[{"award-number":["KJZD20230923115113026"]}]},{"name":"Science and Technology Department of Guizhou Province","award":["Qiankehe Support[2022]General019"],"award-info":[{"award-number":["Qiankehe Support[2022]General019"]}]},{"DOI":"10.13039\/501100012456","name":"National Social Science Fund of China","doi-asserted-by":"publisher","award":["20&ZD226"],"award-info":[{"award-number":["20&ZD226"]}],"id":[{"id":"10.13039\/501100012456","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2023M741843"],"award-info":[{"award-number":["2023M741843"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012228","name":"Development and Reform Commission of Shenzhen Municipality","doi-asserted-by":"publisher","award":["XMHT20190108009"],"award-info":[{"award-number":["XMHT20190108009"]}],"id":[{"id":"10.13039\/501100012228","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276075"],"award-info":[{"award-number":["62276075"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62106115"],"award-info":[{"award-number":["62106115"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006062"],"award-info":[{"award-number":["62006062"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176076"],"award-info":[{"award-number":["62176076"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangdong Provincial Key Laboratory","award":["2022B1212010005"],"award-info":[{"award-number":["2022B1212010005"]}]},{"name":"PCL","award":["PCL2022D01"],"award-info":[{"award-number":["PCL2022D01"]}]},{"name":"PCL","award":["PCL2023A09"],"award-info":[{"award-number":["PCL2023A09"]}]},{"name":"Key Laboratory of Intelligent Computing in Network Environment"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2024.3374062","type":"journal-article","created":{"date-parts":[[2024,3,6]],"date-time":"2024-03-06T14:00:38Z","timestamp":1709733638000},"page":"2125-2140","source":"Crossref","is-referenced-by-count":1,"title":["BaSFormer: A Balanced Sparsity Regularized Attention Network for Transformer"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5888-0328","authenticated-orcid":false,"given":"Shuoran","family":"Jiang","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8473-7293","authenticated-orcid":false,"given":"Qingcai","family":"Chen","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1395-6805","authenticated-orcid":false,"given":"Yang","family":"Xiang","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8270-5455","authenticated-orcid":false,"given":"Youcheng","family":"Pan","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5267-2250","authenticated-orcid":false,"given":"Xiangping","family":"Wu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"A survey of large language models","author":"Zhao","year":"2023"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.ajog.2023.03.010"},{"key":"ref3","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref4","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1038\/s41576-022-00532-2"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref7","article-title":"ChatDoctor: A medical chat model fine-tuned on LLaMA model using medical domain knowledge","author":"Yunxiang","year":"2023"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2022.10.001"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3199648"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.400"},{"key":"ref12","article-title":"Improving graph attention networks with large margin-based constraints","author":"Wang","year":"2019"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1057\/s41599-020-00654-0"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3162081"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.308"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531811"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1475"},{"key":"ref18","article-title":"DropAttention: A regularization method for fully-connected self-attention networks","author":"Zehui","year":"2019"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.346"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2021.3087662"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.470"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4828"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527423"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3544103"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1317"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1467"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.7717\/peerj.16125\/fig-6"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3594633"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17533"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-76153-9_28"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1146"},{"key":"ref33","article-title":"Adaptive sparse transformer for multilingual translation","author":"Gong","year":"2021"},{"key":"ref34","first-page":"1614","article-title":"From softmax to sparsemax: A sparse model of attention and multi-label classification","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Martins","year":"2016"},{"key":"ref35","first-page":"3340","article-title":"A regularized framework for sparse and structured neural attention","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Niculae","year":"2017"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2913087"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4613-0163-9"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/s10878-019-00403-3"},{"key":"ref39","first-page":"2042","article-title":"Convolutional neural network architectures for matching natural language sentences","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hu","year":"2014"},{"key":"ref40","first-page":"1","article-title":"Combining graph convolutional neural networks and label propagation","volume":"40","author":"Wang","year":"2021","journal-title":"ACM Trans. Inf. Syst."},{"key":"ref41","article-title":"The pile: An 800gb dataset of diverse text for language modeling","author":"Gao","year":"2020"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3412762"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4939-7131-2_100141"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n18-1074"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1341"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-3723-3_16"},{"key":"ref49","first-page":"1298","article-title":"PAWS: Paraphrase adversaries from word scrambling","volume-title":"Proc. North Amer. Chapter Assoc. Comput. Linguistics: Hum. Lang. Technol.","author":"Zhang","year":"2019"},{"key":"ref50","article-title":"LongFormer: The long-document transformer","author":"Beltagy","year":"2020"},{"key":"ref51","first-page":"17283","article-title":"Big bird: Transformers for longer sequences","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zaheer","year":"2020"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.291"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1418"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.770"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3459637.3482078"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-6115"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1435"},{"key":"ref58","first-page":"13657","article-title":"Uncertainty calibration for ensemble-based debiasing methods","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Xiong","year":"2021"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.168"},{"key":"ref60","first-page":"2278","article-title":"Decorrelate irrelevant, purify relevant: Overcome textual spurious correlations from a feature perspective","volume-title":"Proc. Int. Conf. Comput. Linguistics","author":"Dou","year":"2022"},{"key":"ref61","article-title":"Whitening sentence representations for better semantics and faster retrieval","author":"Su","year":"2021"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.275"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.230"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.382"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.517"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref68","article-title":"RoBERTa: A robustly optimized BERT pretraining approach","author":"Liu","year":"2019"},{"key":"ref69","article-title":"ALBERT: A lite bert for self-supervised learning of language representations","author":"Lan","year":"2019"},{"key":"ref70","first-page":"137","article-title":"Language models are unsupervised multitask learners","volume-title":"Proc. OSDI04: 6th Symp. Operating Syst. Des. Implementation","author":"Radford","year":"2019"},{"key":"ref71","article-title":"Prompting GPT-3 to be reliable","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Si","year":"2023"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1016\/j.mlwa.2024.100541"},{"key":"ref73","article-title":"Instruction tuning with GPT-4","author":"Peng","year":"2023"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3302"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4717"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1050"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W15-3031"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1149"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref80","first-page":"11181","article-title":"Levenshtein transformer","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gu","year":"2019"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.155"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1633"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.36"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1125"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1437"},{"key":"ref86","first-page":"3016","article-title":"Fast structured decoding for sequence models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Sun","year":"2019"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.83"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-2059"},{"key":"ref89","first-page":"8073","article-title":"Cross-model back-translated distillation for unsupervised machine translation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Nguyen","year":"2021"},{"key":"ref90","first-page":"7059","article-title":"Cross-lingual language model pretraining","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Conneau","year":"2019"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.3"},{"key":"ref92","first-page":"5926","article-title":"MASS: Masked sequence to sequence pre-training for language generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Song","year":"2019"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.460"},{"key":"ref94","first-page":"1","article-title":"Beyond English-centric multilingual machine translation","volume":"22","author":"Fan","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref95","article-title":"DeltaLM: Encoder-decoder pre-training for language generation and translation by augmenting pretrained multilingual encoders","author":"Ma","year":"2021"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1101"},{"key":"ref97","first-page":"1631","article-title":"Recursive deep models for semantic compositionality over a sentiment treebank","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Socher","year":"2013"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.230"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/10304349\/10461098.pdf?arnumber=10461098","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T00:37:17Z","timestamp":1725669437000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10461098\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":98,"URL":"https:\/\/doi.org\/10.1109\/taslp.2024.3374062","relation":{"has-preprint":[{"id-type":"doi","id":"10.36227\/techrxiv.22824908.v1","asserted-by":"object"}]},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}