{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T01:43:55Z","timestamp":1775007835069,"version":"3.50.1"},"reference-count":171,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/100010661","name":"Horizon 2020, project AI4EU","doi-asserted-by":"publisher","award":["825619"],"award-info":[{"award-number":["825619"]}],"id":[{"id":"10.13039\/100010661","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1109\/tnnls.2020.3019893","type":"journal-article","created":{"date-parts":[[2020,9,10]],"date-time":"2020-09-10T20:28:50Z","timestamp":1599769730000},"page":"4291-4308","source":"Crossref","is-referenced-by-count":483,"title":["Attention in Natural Language Processing"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9711-7042","authenticated-orcid":false,"given":"Andrea","family":"Galassi","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9663-1071","authenticated-orcid":false,"given":"Marco","family":"Lippi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9253-8638","authenticated-orcid":false,"given":"Paolo","family":"Torroni","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1216"},{"key":"ref2","first-page":"1","article-title":"Neural machine translation by jointly learning to align and translate","volume-title":"Proc. ICLR","author":"Bahdanau"},{"key":"ref3","first-page":"1243","article-title":"Learning to combine foveal glimpses with a third-order Boltzmann machine","volume-title":"Proc. NIPS","author":"Larochelle"},{"key":"ref4","first-page":"2204","article-title":"Recurrent models of visual attention","volume-title":"Proc. NIPS","author":"Mnih"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3236009"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4828"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5429"},{"key":"ref8","article-title":"Attention interpretability across NLP tasks","author":"Vashishth","year":"2019","journal-title":"arXiv:1909.11218"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1002"},{"key":"ref10","first-page":"3543","article-title":"Attention is not explanation","volume-title":"Proc. NAACL-HLT","author":"Jain"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1282"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-2007"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-2021"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1613\/jair.5477"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/MCI.2018.2840738"},{"key":"ref16","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","volume-title":"Proc. ICML","volume":"37","author":"Xu"},{"key":"ref17","first-page":"1462","article-title":"Draw: A recurrent neural network for image generation","volume-title":"Proc. ICML","volume":"37","author":"Gregor"},{"key":"ref18","first-page":"7354","article-title":"Self-attention generative adversarial networks","volume-title":"Proc. ICML","volume":"97","author":"Zhang"},{"key":"ref19","article-title":"End-to-end continuous speech recognition using attention-based recurrent NN: First results","volume":"abs\/1412.1602","author":"Chorowski","year":"2014","journal-title":"CoRR"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1910"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11851"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/546"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2869225"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11635"},{"key":"ref26","article-title":"Multi-focus attention network for efficient deep reinforcement learning","volume":"abs\/1712.04603","author":"Choi","year":"2017","journal-title":"CoRR"},{"key":"ref27","first-page":"2692","article-title":"Pointer networks","volume-title":"Proc. NIPS","author":"Vinyals"},{"key":"ref28","first-page":"1","article-title":"Attention, learn to solve routing problems!","volume-title":"Proc. ICLR","author":"Kool"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1249"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1096"},{"key":"ref32","first-page":"2174","article-title":"Interactive attention for neural machine translation","volume-title":"Proc. COLING","author":"Meng"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1008"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1317"},{"key":"ref35","first-page":"3093","article-title":"Neural machine translation with supervised attention","volume-title":"Proc. COLING","author":"Liu"},{"key":"ref36","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. NIPS","author":"Vaswani"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1151"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1338"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1475"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11910"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11971"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1167"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11254"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1331"},{"key":"ref45","article-title":"Decoding-history-based adaptive control of attention for neural machine translation","volume":"abs\/1802.01812","author":"Lin","year":"2018","journal-title":"CoRR"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-2059"},{"key":"ref47","first-page":"1","article-title":"Structured attention networks","volume-title":"Proc. ICLR","author":"Kim"},{"key":"ref48","first-page":"14014","article-title":"Are sixteen heads really better than one?","volume-title":"Proc. NeurIPS","author":"Michel"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1117"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1580"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1036"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1174"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1076"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1117"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1044"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/577"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1370"},{"key":"ref58","first-page":"435","article-title":"Abstractive text summarization using recurrent neural networks: Systematic literature review","volume-title":"Proc. ICICKM","volume":"13","author":"Ngoko"},{"key":"ref59","first-page":"2440","article-title":"End-to-end memory networks","volume-title":"Proc. NIPS","author":"Sukhbaatar"},{"key":"ref60","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. NAACL-HLT","author":"Devlin"},{"key":"ref61","first-page":"1","article-title":"Frustratingly short attention spans in neural language modeling","volume-title":"Proc. ICLR","author":"Daniluk"},{"key":"ref62","first-page":"1693","article-title":"Teaching machines to read and comprehend","volume-title":"Proc. NIPS","author":"Hermann"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1053"},{"key":"ref64","first-page":"1","article-title":"The goldilocks principle: Reading children\u2019s books with explicit memory representations","volume-title":"Proc. ICLR","author":"Hill"},{"key":"ref65","first-page":"1777","article-title":"Consensus attention-based neural networks for Chinese reading comprehension","volume-title":"Proc. COLING","author":"Cui"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1086"},{"key":"ref67","article-title":"Iterative alternating neural attention for machine reading","volume":"abs\/1606.02245","author":"Sordoni","year":"2016","journal-title":"CoRR"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1122"},{"key":"ref69","article-title":"Attentive pooling networks","volume":"abs\/1602.03609","author":"dos Santos","year":"2016","journal-title":"CoRR"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1055"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1168"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098177"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12065"},{"key":"ref74","first-page":"6077","article-title":"Hierarchical attention flow for multiple-choice reading comprehension","volume-title":"Proc. AAAI","author":"Zhu"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/615"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1021"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1272"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12024"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-00563-4_16"},{"key":"ref80","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","volume-title":"Proc. NIPS","author":"Lu"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/126"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11832"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-2017"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-2063"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1220"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12043"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1244"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1185"},{"key":"ref91","first-page":"1614","article-title":"From softmax to sparsemax: A sparse model of attention and multi-label classification","volume-title":"Proc. ICML","volume":"48","author":"Martins"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1170"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11941"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/604"},{"key":"ref95","first-page":"1815","article-title":"Enhancing sentence embedding with generalized pooling","volume-title":"Proc. COLING","author":"Chen"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2019.04.054"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1548"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11928"},{"key":"ref99","first-page":"1","article-title":"Reasoning about entailment with neural attention","volume-title":"Proc. ICLR","author":"Rockt\u00e4schel"},{"key":"ref100","first-page":"1","article-title":"A structured self-attentive sentence embedding","volume-title":"Proc. ICLR","author":"Lin"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1230"},{"key":"ref102","first-page":"2773","article-title":"Grammar as a foreign language","volume-title":"Proc. NIPS","author":"Vinyals"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1249"},{"key":"ref104","first-page":"785","article-title":"Dynamic feature selection with attention in incremental parsing","volume-title":"Proc. COLING","author":"Kohita"},{"key":"ref105","first-page":"1","article-title":"Deep biaffine attention for neural dependency parsing","volume-title":"Proc. ICLR","author":"Dozat"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1021"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1145\/3132847.3133037"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1058"},{"key":"ref109","first-page":"1121","article-title":"Effective attention modeling for aspect-level sentiment classification","volume-title":"Proc. COLING","author":"He"},{"key":"ref110","first-page":"1077","article-title":"Enhanced aspect level sentiment classification with auxiliary memory","volume-title":"Proc. COLING","author":"Zhu"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/568"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1380"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12048"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1381"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1137"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12055"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.29007\/pjn4"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2019.2903056"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2900335"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/583"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1069"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1129"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1402"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4509"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4509"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1508"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1506"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1167"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33015361"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1145\/3363574"},{"issue":"1","key":"ref131","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-031-02165-7","volume-title":"Neural Network Methods for Natural Language Processing","volume":"10","author":"Goldberg","year":"2017"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1458"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10967"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11917"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1375"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12272"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1106"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1176"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/72.279181"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1180"},{"key":"ref142","article-title":"Neural turing machines","volume":"abs\/1410.5401","author":"Graves","year":"2014","journal-title":"CoRR"},{"key":"ref143","first-page":"315","article-title":"Deep sparse rectifier neural networks","volume-title":"Proc. AISTATS","volume":"15","author":"Glorot"},{"key":"ref144","first-page":"971","article-title":"Self-normalizing neural networks","volume-title":"Proc. NIPS","author":"Klambauer"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1245"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7432.001.0001"},{"key":"ref148","volume-title":"Neural-Symbolic Learning Systems: Foundations and Applications","author":"Garcez","year":"2012"},{"key":"ref149","first-page":"38","article-title":"Reasoning with deep learning: An open challenge","volume-title":"Proc. CEUR Workshop","volume":"1802","author":"Lippi"},{"key":"ref150","first-page":"260","article-title":"Using \u2018annotator rationales\u2019 to improve machine learning for text categorization","volume-title":"Proc. HLT-NAACL","author":"Zaidan"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-019-0048-x"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1002\/hbe2.117"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639343"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1159"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1232"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/349"},{"key":"ref157","first-page":"917","article-title":"Uncertainty-aware attention for reliable interpretation and prediction","volume-title":"Proc. NIPS","author":"Heo"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2016.61"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00781"},{"key":"ref160","first-page":"1613","article-title":"Weight uncertainty in neural networks","volume-title":"Proc. ICML","author":"Blundell"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1036"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11926"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10969"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2019.04.161"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1553"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_25"},{"key":"ref167","first-page":"2319","article-title":"Differentiable learning of logical rules for knowledge base reasoning","volume-title":"Proc. NIPS","author":"Yang"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33017152"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210081"},{"key":"ref170","first-page":"1","article-title":"Compositional attention networks for machine reasoning","volume-title":"Proc. ICLR","author":"Hudson"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1028"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5962385\/9559436\/09194070.pdf?arnumber=9194070","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,24]],"date-time":"2024-01-24T00:52:20Z","timestamp":1706057540000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9194070\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10]]},"references-count":171,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2020.3019893","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,10]]}}}