{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,21]],"date-time":"2025-12-21T06:26:09Z","timestamp":1766298369916,"version":"3.37.3"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,1,30]],"date-time":"2025-01-30T00:00:00Z","timestamp":1738195200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,1,30]],"date-time":"2025-01-30T00:00:00Z","timestamp":1738195200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72271034"],"award-info":[{"award-number":["72271034"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Artif Intell Rev"],"DOI":"10.1007\/s10462-025-11115-y","type":"journal-article","created":{"date-parts":[[2025,1,30]],"date-time":"2025-01-30T05:47:05Z","timestamp":1738216025000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Word embedding factor based multi-head attention"],"prefix":"10.1007","volume":"58","author":[{"given":"Zhengren","family":"Li","sequence":"first","affiliation":[]},{"given":"Yumeng","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Xiaohang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Huawei","family":"Han","sequence":"additional","affiliation":[]},{"given":"Cui","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,30]]},"reference":[{"key":"11115_CR1","unstructured":"Bahdanau D, Cho K, Bengio 
Y (2015) Neural machine translation by jointly learning to align and translate. In Proceedings of the 3rd International Conference on Learning Representations, ICLR 2015, pp. 1\u201315"},{"key":"11115_CR2","unstructured":"Barbieri F, Anke LE, Camacho-Collados J (2022) Xlm-t: Multilingual language models in twitter for sentiment analysis and beyond, Proceedings of the Thirteenth Language Resources and Evaluation Conference. Publishing, pp. 258\u2013266"},{"key":"11115_CR3","first-page":"77","volume":"3","author":"MS Bartlett","year":"1950","unstructured":"Bartlett MS (1950) Tests of significance in factor analysis. Br J Psychol 3:77\u201385","journal-title":"Br J Psychol"},{"key":"11115_CR4","doi-asserted-by":"crossref","unstructured":"Bowman SR, Angeli G, Potts C, Manning CD (2015) A large annotated corpus for learning natural language inference, Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, EMNLP 2015. Publishing, pp. 632\u2013642","DOI":"10.18653\/v1\/D15-1075"},{"key":"11115_CR5","unstructured":"Brown TB, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler DM, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language models are few-shot learners, Proceedings of the 34th International Conference on Neural Information Processing Systems (NeurIPS 2020). Publishing, Vancouver, BC, Canada, pp. 1877\u20131901"},{"key":"11115_CR6","doi-asserted-by":"crossref","unstructured":"Cheng J, Dong L, Lapata M (2016) Long short-term memory-networks for machine reading. 
2016 Conference on Empirical Methods in Natural Language Processing, 551\u2013561","DOI":"10.18653\/v1\/D16-1053"},{"key":"11115_CR7","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2019) Bert: Pre-training of deep bidirectional transformers for language understanding. Proceedings of NAACL-HLT, 4171\u20134186"},{"key":"11115_CR8","unstructured":"Dolan B, Brockett C (2005) Automatically constructing a corpus of sentential paraphrases, Third International Workshop on Paraphrasing (IWP2005). Publishing"},{"key":"11115_CR9","doi-asserted-by":"publisher","first-page":"511","DOI":"10.1016\/j.ijnurstu.2013.10.005","volume":"51","author":"CJ Gaskin","year":"2014","unstructured":"Gaskin CJ, Happell B (2014) On exploratory factor analysis: a review of recent evidence, an assessment of current practice, and recommendations for future use. Int J Nurs Stud 51:511\u2013521","journal-title":"Int J Nurs Stud"},{"key":"11115_CR10","doi-asserted-by":"publisher","first-page":"3510","DOI":"10.1007\/s12144-019-00300-2","volume":"40","author":"D Goretzko","year":"2021","unstructured":"Goretzko D, Pham TTH, B\u00fchner M (2021) Exploratory factor analysis: current use, methodological developments and recommendations for good practice. Curr Psychol 40:3510\u20133521","journal-title":"Curr Psychol"},{"key":"11115_CR11","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1007\/BF02289468","volume":"31","author":"HH Harman","year":"1966","unstructured":"Harman HH, Jones WH (1966) Factor analysis by minimizing residuals (minres). Psychometrika 31:351\u2013368","journal-title":"Psychometrika"},{"key":"11115_CR12","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep Residual Learning for Image Recognition, 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). Publishing, pp. 
770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"11115_CR13","doi-asserted-by":"publisher","first-page":"393","DOI":"10.1177\/0013164405282485","volume":"66","author":"RK Henson","year":"2006","unstructured":"Henson RK, Roberts JK (2006) Use of exploratory factor analysis in published research: common errors and some comment on improved practice. Educ Psychol Meas 66:393\u2013416","journal-title":"Educ Psychol Meas"},{"key":"11115_CR14","first-page":"30016","volume":"35","author":"J Hoffmann","year":"2022","unstructured":"Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, d.L. Casas D, A. Hendricks L, Welbl J, Clark A, Hennigan T, Noland E, Millican K, v.d. Driessche G, Damoc B, Guy A, Osindero S, Simonyan K, Elsen E, Vinyals O, Rae J, Sifre L (2022) An empirical analysis of compute-optimal large language model training. Proc 36th Int Conf Neural Inform Process Syst (NeurIPS 2022) 35:30016\u201330030","journal-title":"Proc 36th Int Conf Neural Inform Process Syst (NeurIPS 2022)"},{"key":"11115_CR15","unstructured":"Hua W, Dai Z, Liu H, Le Q (2022) Transformer Quality in Linear Time, in: C. Kamalika, J. Stefanie, S. Le, S. Csaba, N. Gang & S. Sivan (Eds.), Proceedings of the 39th International Conference on Machine Learning. Publishing, Proceedings of Machine Learning Research, pp. 9099\u20139117"},{"key":"11115_CR16","doi-asserted-by":"crossref","unstructured":"Iandola F, Shaw A, Krishna R, Keutzer K (2020) SqueezeBERT: What can computer vision teach NLP about efficient neural networks? Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing. Publishing, pp. 124\u2013135","DOI":"10.18653\/v1\/2020.sustainlp-1.17"},{"key":"11115_CR17","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1007\/BF02289658","volume":"32","author":"KG J\u00f6reskog","year":"1967","unstructured":"J\u00f6reskog KG (1967) Some contributions to maximum likelihood factor analysis. 
Psychometrika 32:443\u2013482","journal-title":"Psychometrika"},{"key":"11115_CR18","doi-asserted-by":"publisher","first-page":"183","DOI":"10.1007\/BF02289343","volume":"34","author":"KG J\u00f6reskog","year":"1969","unstructured":"J\u00f6reskog KG (1969) A general approach to confirmatory maximum likelihood factor analysis. Psychometrika 34:183\u2013202","journal-title":"Psychometrika"},{"key":"11115_CR19","doi-asserted-by":"publisher","first-page":"401","DOI":"10.1007\/BF02291817","volume":"35","author":"HF Kaiser","year":"1970","unstructured":"Kaiser HF (1970) A second generation little jiffy. Psychometrika 35:401\u2013415","journal-title":"Psychometrika"},{"key":"11115_CR20","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1177\/001316447403400115","volume":"34","author":"HF Kaiser","year":"1974","unstructured":"Kaiser HF, Rice J (1974) Little jiffy, mark IV. Educ Psychol Meas 34:111\u2013117","journal-title":"Educ Psychol Meas"},{"key":"11115_CR21","unstructured":"Kingma DP, Ba J (2015) Adam: A method for stochastic optimization. In Proceedings of the 3rd International Conference on Learning Representations, ICLR 2015 https:\/\/arxiv.org\/abs\/1412.6980"},{"key":"11115_CR22","doi-asserted-by":"publisher","first-page":"rm1","DOI":"10.1187\/cbe.18-04-0064","volume":"18","author":"E Knekta","year":"2019","unstructured":"Knekta E, Runyon C, Eddy S (2019) One size doesn\u2019t fit all: using factor analysis to gather validity evidence when using surveys in your research. CBE\u2014Life Sci Educ 18:rm1","journal-title":"CBE\u2014Life Sci Educ"},{"key":"11115_CR23","first-page":"172","volume":"33","author":"DN Lawley","year":"1943","unstructured":"Lawley DN (1943) The application of the maximum likelihood method to factor analysis. 
Br J Psychol 33:172\u2013175","journal-title":"Br J Psychol"},{"key":"11115_CR24","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.neucom.2021.04.038","volume":"454","author":"J Li","year":"2021","unstructured":"Li J, Wang X, Tu Z, Lyu MR (2021) On the diversity of multi-head attention. Neurocomputing 454:14\u201324","journal-title":"Neurocomputing"},{"key":"11115_CR25","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2019) Roberta: A robustly optimized bert pretraining approach. Computer Science"},{"key":"11115_CR26","doi-asserted-by":"crossref","unstructured":"Liu Z, Ning J, Cao Y, Wei Y, Zhang Z, Lin S, Hu H (2022) Video swin transformer, Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. Publishing, pp. 3192\u20133201","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"11115_CR27","doi-asserted-by":"crossref","first-page":"1151","DOI":"10.6018\/analesps.30.3.199361","volume":"30","author":"S Lloret-Segura","year":"2014","unstructured":"Lloret-Segura S, Ferreres-Traver A, Hernandez-Baeza A, Tomas-Marco I (2014) Exploratory item factor analysis: a practical guide revised and updated. Anales De Psicolog\u00eda 30:1151\u20131169","journal-title":"Anales De Psicolog\u00eda"},{"key":"11115_CR28","unstructured":"Maas A, Daly RE, Pham PT, Huang D, Ng AY, Potts C (2011) Learning word vectors for sentiment analysis, Proceedings of the 49th annual meeting of the association for computational linguistics: Human language technologies. Publishing, pp. 142\u2013150"},{"key":"11115_CR29","first-page":"44","volume":"10","author":"OA Mabel","year":"2020","unstructured":"Mabel OA, Olayemi OS (2020) A comparison of principal component analysis, maximum likelihood and the principal axis in factor analysis. 
Am J Math Stat 10:44\u201354","journal-title":"Am J Math Stat"},{"key":"11115_CR30","doi-asserted-by":"crossref","unstructured":"Mao R, Liu Q, He K, Li W, Cambria E (2022) The biases of pre-trained language models: An empirical study on prompt-based sentiment analysis and emotion detection. IEEE Transactions on Affective Computing","DOI":"10.1109\/TAFFC.2022.3204972"},{"key":"11115_CR31","unstructured":"Mikolov T, Sutskever I, Chen K, Corrado GS, Dean J (2013) Distributed representations of words and phrases and their compositionality. In NeurIPS, pp. 3111\u20133119"},{"key":"11115_CR32","doi-asserted-by":"publisher","first-page":"74","DOI":"10.1016\/j.jpsychores.2012.10.010","volume":"74","author":"S Norton","year":"2013","unstructured":"Norton S, Cosco T, Doyle F, Done J, Sacker A (2013) The Hospital anxiety and Depression Scale: a meta confirmatory factor analysis. J Psychosom Res 74:74\u201381","journal-title":"J Psychosom Res"},{"key":"11115_CR33","doi-asserted-by":"publisher","first-page":"2575","DOI":"10.1081\/JLC-100102045","volume":"22","author":"S Ounnar","year":"1999","unstructured":"Ounnar S, Righezza M (1999) Data analysis. A bridge between factor analysis and chromatography. J Liq Chromatogr Relat Technol 22:2575\u20132594","journal-title":"J Liq Chromatogr Relat Technol"},{"key":"11115_CR34","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: Global vectors for word representation, Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP). Publishing, pp. 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"key":"11115_CR35","unstructured":"Radford A, Narasimhan K, Salimans T, Sutskever I (2018) Improving language understanding by generative pre-training. 
arXiv:1806.07462"},{"key":"11115_CR36","doi-asserted-by":"publisher","first-page":"117275","DOI":"10.1016\/j.eswa.2022.117275","volume":"202","author":"S Reza","year":"2022","unstructured":"Reza S, Ferreira MC, Machado JJ, Tavares JMR (2022) A multi-head attention-based transformer model for traffic flow forecasting with a comparative analysis to recurrent neural networks. Expert Syst Appl 202:117275","journal-title":"Expert Syst Appl"},{"key":"11115_CR37","doi-asserted-by":"crossref","unstructured":"Robbins H, Monro S (1951) A stochastic approximation method. Annal Math Stat 22(3):400\u2013407","DOI":"10.1214\/aoms\/1177729586"},{"key":"11115_CR38","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1162\/tacl_a_00353","volume":"9","author":"A Roy","year":"2021","unstructured":"Roy A, Saffar M, Vaswani A, Grangier D (2021) Efficient content-based sparse attention with routing transformers. Trans Association Comput Linguistics 9:53\u201368","journal-title":"Trans Association Comput Linguistics"},{"key":"11115_CR39","doi-asserted-by":"crossref","unstructured":"Sang EF, De Meulder F (2003) Introduction to the CoNLL-2003 shared task: Language-independent named entity recognition. Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL 2003, 142\u2013147","DOI":"10.3115\/1119176.1119195"},{"key":"11115_CR40","doi-asserted-by":"publisher","first-page":"109831","DOI":"10.1016\/j.asoc.2022.109831","volume":"132","author":"L Su","year":"2023","unstructured":"Su L, Xiong L, Yang J (2023) Multi-Attn BLS: Multi-head attention mechanism with broad learning system for chaotic time series prediction. Appl Soft Comput 132:109831","journal-title":"Appl Soft Comput"},{"key":"11115_CR41","doi-asserted-by":"crossref","unstructured":"Suhara Y, Li J, Li Y, Zhang D, Demiralp \u00c7, Chen C, Tan W-C (2022) Annotating columns with pre-trained language models, Proceedings of the 2022 International Conference on Management of Data. Publishing, pp. 
1493\u20131503","DOI":"10.1145\/3514221.3517906"},{"key":"11115_CR42","unstructured":"Sun T, Shao Y, Qian H, Huang X, Qiu X (2022) Black-box tuning for language-model-as-a-service, International Conference on Machine Learning. Publishing, pp. 20841\u201320855"},{"key":"11115_CR43","unstructured":"Tay Y, Dehghani M, Abnar S, Shen Y, Bahri D, Pham P, Rao J, Yang L, Ruder S, Metzler D (2020) Long Range Arena: A Benchmark for Efficient Transformers, International Conference on Learning Representations. Publishing"},{"key":"11115_CR44","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30:5998\u20136008"},{"key":"11115_CR46","doi-asserted-by":"crossref","unstructured":"Wang H, Tu M (2020) Enhancing attention models via multi-head collaboration, 2020 International Conference on Asian Language Processing (IALP). Publishing, pp. 19\u201323","DOI":"10.1109\/IALP51396.2020.9310460"},{"key":"11115_CR45","doi-asserted-by":"crossref","unstructured":"Wang A, Singh A, Michael J, Hill F, Levy O, Bowman SR (2018) GLUE: A multi-task benchmark and analysis platform for natural language understanding. In Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP, 353\u2013355","DOI":"10.18653\/v1\/W18-5446"},{"key":"11115_CR47","doi-asserted-by":"publisher","first-page":"625","DOI":"10.1162\/tacl_a_00290","volume":"7","author":"A Warstadt","year":"2019","unstructured":"Warstadt A, Singh A, Bowman SR (2019) Neural network acceptability judgments. 
Trans Association Comput Linguistics 7:625\u2013641","journal-title":"Trans Association Comput Linguistics"},{"key":"11115_CR48","doi-asserted-by":"publisher","first-page":"1071","DOI":"10.1007\/s10270-022-00975-5","volume":"21","author":"M Weyssow","year":"2022","unstructured":"Weyssow M, Sahraoui H, Syriani E (2022) Recommending metamodel concepts during modeling activities with pre-trained language models. Softw Syst Model 21:1071\u20131089","journal-title":"Softw Syst Model"},{"key":"11115_CR49","doi-asserted-by":"crossref","unstructured":"Williams B, Onsman A, Brown T (2010) Exploratory factor analysis: a five-step guide for novices. J Emerg Primary Health Care 8(3):1\u201313","DOI":"10.33151\/ajp.8.3.93"},{"key":"11115_CR50","unstructured":"Xiao D, Meng Q, Li S, Yuan X (2024) Improving Transformers with Dynamically Composable Multi-Head Attention. In Proceedings of the 41st International Conference on Machine Learning (ICML 2024)"},{"key":"11115_CR51","doi-asserted-by":"publisher","first-page":"1099","DOI":"10.1016\/S1872-2040(14)60757-X","volume":"42","author":"C Yi","year":"2014","unstructured":"Yi C, Fei T, Tie-Gang L, Jiu-Ming H, Abliz Z, Li-Tao L, Xiao-Hao W (2014) Application of factor analysis in imaging mass spectrometric data analysis. Chin J Anal Chem 42:1099\u20131103","journal-title":"Chin J Anal Chem"},{"key":"11115_CR53","doi-asserted-by":"publisher","first-page":"109584","DOI":"10.1016\/j.knosys.2022.109584","volume":"254","author":"P Zeng","year":"2022","unstructured":"Zeng P, Hu G, Zhou X, Li S, Liu P, Liu S (2022) Muformer: a long sequence time-series forecasting model based on modified multi-head attention. 
Knowl Based Syst 254:109584","journal-title":"Knowl Based Syst"},{"key":"11115_CR52","doi-asserted-by":"publisher","first-page":"15747","DOI":"10.1007\/s10489-022-04263-z","volume":"53","author":"P Zeng","year":"2023","unstructured":"Zeng P, Hu G, Zhou X, Li S, Liu P (2023) Seformer: a long sequence time-series forecasting model based on binary position encoding and information transfer regularization. Appl Intell 53:15747\u201315771","journal-title":"Appl Intell"}],"container-title":["Artificial Intelligence Review"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-025-11115-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10462-025-11115-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-025-11115-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T05:37:27Z","timestamp":1740116247000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10462-025-11115-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,30]]},"references-count":53,"journal-issue":{"issue":"4","published-online":{"date-parts":[[2025,4]]}},"alternative-id":["11115"],"URL":"https:\/\/doi.org\/10.1007\/s10462-025-11115-y","relation":{},"ISSN":["1573-7462"],"issn-type":[{"type":"electronic","value":"1573-7462"}],"subject":[],"published":{"date-parts":[[2025,1,30]]},"assertion":[{"value":"12 January 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 January 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article 
History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"115"}}