{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T02:17:48Z","timestamp":1775873868003,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T00:00:00Z","timestamp":1747872000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T00:00:00Z","timestamp":1747872000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Discov Artif Intell"],"DOI":"10.1007\/s44163-025-00304-x","type":"journal-article","created":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T08:28:14Z","timestamp":1747902494000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Protein sequence classification using natural language processing techniques"],"prefix":"10.1007","volume":"5","author":[{"given":"Huma","family":"Perveen","sequence":"first","affiliation":[]},{"given":"Julie","family":"Weeds","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,22]]},"reference":[{"key":"304_CR1","volume-title":"Computational intelligence in data mining","author":"J Dongardive","year":"2016","unstructured":"Dongardive J, Abraham S. Protein sequence classification based on N-gram and K-nearest neighbor algorithm. In: Behera H, Mohapatra D, editors. Computational intelligence in data mining. Berlin: Springer; 2016."},{"key":"304_CR2","doi-asserted-by":"publisher","DOI":"10.1109\/ISCC.2017.8024644","author":"M Li","year":"2017","unstructured":"Li M, Ling C, Gao J. An efficient CNN-based classification on G-protein coupled receptors using TF-IDF and N-gram. IEEE Sympos Computers Commun (ISCC). 2017. https:\/\/doi.org\/10.1109\/ISCC.2017.8024644.","journal-title":"IEEE Sympos Computers Commun (ISCC)"},{"key":"304_CR3","volume-title":"Protein family classification with neural network","author":"T Lee","year":"2019","unstructured":"Lee T, Nguyen T. Protein family classification with neural network. Stanford: Stanford University; 2019."},{"key":"304_CR4","doi-asserted-by":"publisher","DOI":"10.1101\/414631","author":"A Vazhayil","year":"2019","unstructured":"Vazhayil A, Vinayakumar R, Soman KP. Deep proteomics: protein family classification using shallow and deep networks. Center Comput Eng Netw (CEN). 2019. https:\/\/doi.org\/10.1101\/414631.","journal-title":"Center Comput Eng Netw (CEN)"},{"issue":"9","key":"304_CR5","doi-asserted-by":"publisher","first-page":"1481","DOI":"10.1093\/bioinformatics\/btx823","volume":"34","author":"SMA Islam","year":"2018","unstructured":"Islam SMA, Heil BJ, Kearney CM, Baker EJ. Protein classification using modified n-grams and skip-grams. Bioinformatics. 2018;34(9):1481\u20137. https:\/\/doi.org\/10.1093\/bioinformatics\/btx823.","journal-title":"Bioinformatics"},{"key":"304_CR6","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1155\/2013\/854745","volume":"2013","author":"A Barve","year":"2013","unstructured":"Barve A, Ghaskadbi S, Ghaskadbi S. Structural and sequence similarities of hydra Xeroderma pigmentosum a protein to human homolog suggest early evolution and conservation. BioMed Res Int. 2013;2013:9. https:\/\/doi.org\/10.1155\/2013\/854745.","journal-title":"BioMed Res Int"},{"key":"304_CR7","doi-asserted-by":"publisher","first-page":"423589","DOI":"10.1155\/2010\/423589","volume":"2010","author":"C Chen","year":"2010","unstructured":"Chen C, McGarvey PB, Huang H, Wu CH. Protein bioinformatics infrastructure for the integration and analysis of multiple high-throughput omics data. Adv Bioinform. 2010;2010:423589. https:\/\/doi.org\/10.1155\/2010\/423589.","journal-title":"Adv Bioinform"},{"key":"304_CR8","doi-asserted-by":"publisher","first-page":"165342","DOI":"10.1155\/2013\/165342","volume":"2013","author":"H Cong","year":"2013","unstructured":"Cong H, Zhang M, Zhang Q, et al. Analysis of structures and epitopes of surface antigen glycoproteins expressed in bradyzoites of Toxoplasma gondii. BioMed Res Int. 2013;2013:165342. https:\/\/doi.org\/10.1155\/2013\/165342.","journal-title":"BioMed Res Int"},{"key":"304_CR9","doi-asserted-by":"publisher","first-page":"Article 917153","DOI":"10.1155\/2013\/917153","volume":"2013","author":"J Machado","year":"2013","unstructured":"Machado J, Costa AC, Quelhas M. Can power laws help us understand gene and proteome information? Adv Math Phys. 2013;2013:Article 917153. https:\/\/doi.org\/10.1155\/2013\/917153.","journal-title":"Adv Math Phys"},{"key":"304_CR10","doi-asserted-by":"publisher","DOI":"10.1155\/2013\/612649","author":"V Carregari","year":"2013","unstructured":"Carregari V, Floriano R, Rodrigues-Simioni L, et al. Biochemical, pharmacological, and structural characterization of new basic PLA2 bbil-tx from Bothriopsis bilineata snake venom. BioMed Res Int. 2013. https:\/\/doi.org\/10.1155\/2013\/612649.","journal-title":"BioMed Res Int"},{"key":"304_CR11","doi-asserted-by":"publisher","DOI":"10.1155\/2013\/198250","author":"L Liu","year":"2013","unstructured":"Liu L, Cui J, Zhang X, Wei T, Jiang P, Wang Z. Analysis of structures, functions, and epitopes of cysteine protease from Spirometra erinaceieuropaei spargana. BioMed Res Int. 2013. https:\/\/doi.org\/10.1155\/2013\/198250.","journal-title":"BioMed Res Int"},{"key":"304_CR12","volume-title":"Bioinformatics: the machine learning approach","author":"P Baldi","year":"2001","unstructured":"Baldi P, Brunak S. Bioinformatics: the machine learning approach. Cambridge: The MIT Press; 2001."},{"key":"304_CR13","doi-asserted-by":"crossref","unstructured":"Yang, Y., Lu, B.-L., & Yang, W.-Y. Classification of protein sequences based on word segmentation methods. In Proceedings of the 6th Asia-Pacific Bioinformatics Conference (APBC '08) (pp. 177\u2013186). Imperial College Press. 2008.","DOI":"10.1142\/9781848161092_0020"},{"issue":"1","key":"304_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/1477-5956-10-1","volume":"10","author":"C Caragea","year":"2012","unstructured":"Caragea C, Silvescu A, Mitra P. Protein sequence classification using feature hashing. Proteome Sci. 2012;10(1):1\u20138. https:\/\/doi.org\/10.1186\/1477-5956-10-1.","journal-title":"Proteome Sci"},{"key":"304_CR15","unstructured":"Wang, D., Lee, N. K., Dillon, T. S., & Hoogenraad, N. J. Protein sequences classification using radial basis function (RBF) neural networks. In Proceedings of the 9th International Conference on Neural Information Processing (pp. 764\u2013768). 2002."},{"issue":"1","key":"304_CR16","first-page":"53","volume":"1","author":"D Wang","year":"2003","unstructured":"Wang D, Lee NK, Dillon TS. Extraction and optimization of fuzzy protein sequences classification rules using GRBF neural networks. Informs Process Lett Rev. 2003;1(1):53\u20139.","journal-title":"Informs Process Lett Rev"},{"key":"304_CR17","unstructured":"Wang, D., & Huang, G.-B. Protein sequence classification using extreme learning machine. In Proceedings of the International Joint Conference on Neural Networks (IJCNN '05) (pp. 1406\u20131411). 2005. Montreal, Canada."},{"key":"304_CR18","unstructured":"Huang, G.-B., Zhu, Q.-Y., & Siew, C.-K. Extreme learning machine: A new learning scheme of feedforward neural networks. In Proceedings of the IEEE International Joint Conference on Neural Networks (pp. 985\u2013990). 2004."},{"issue":"1\u20133","key":"304_CR19","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1016\/j.neucom.2005.12.126","volume":"70","author":"G-B Huang","year":"2006","unstructured":"Huang G-B, Zhu Q-Y, Siew C-K. Extreme learning machine: theory and applications. Neurocomputing. 2006;70(1\u20133):489\u2013501. https:\/\/doi.org\/10.1016\/j.neucom.2005.12.126.","journal-title":"Neurocomputing"},{"issue":"3","key":"304_CR20","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1109\/TCSII.2005.856901","volume":"53","author":"G-B Huang","year":"2006","unstructured":"Huang G-B, Zhu Q-Y, Mao KZ, Siew C-K, Saratchandran P, Sundararajan N. Can threshold networks be trained directly? IEEE Trans Circuits Syst II Express Briefs. 2006;53(3):187\u201391. https:\/\/doi.org\/10.1109\/TCSII.2005.856901.","journal-title":"IEEE Trans Circuits Syst II Express Briefs"},{"key":"304_CR21","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1090\/qam\/10666","volume":"2","author":"K Levenberg","year":"1944","unstructured":"Levenberg K. A method for the solution of certain problems in least squares. Quart Appl Math. 1944;2:164\u20138.","journal-title":"Quart Appl Math"},{"issue":"2","key":"304_CR22","doi-asserted-by":"publisher","first-page":"431","DOI":"10.1137\/0111030","volume":"11","author":"D Marquardt","year":"1963","unstructured":"Marquardt D. An algorithm for least-squares estimation of nonlinear parameters. SIAM J Appl Math. 1963;11(2):431\u201341.","journal-title":"SIAM J Appl Math"},{"issue":"3","key":"304_CR23","doi-asserted-by":"publisher","first-page":"285","DOI":"10.1007\/s11063-012-9234-2","volume":"36","author":"J Cao","year":"2012","unstructured":"Cao J, Lin Z. Self-adaptive evolutionary extreme learning machine. Neural Process Lett. 2012;36(3):285\u2013305. https:\/\/doi.org\/10.1007\/s11063-012-9234-2.","journal-title":"Neural Process Lett"},{"key":"304_CR24","doi-asserted-by":"publisher","DOI":"10.1155\/2014\/103054","author":"J Cao","year":"2014","unstructured":"Cao J, Xiong L. Protein sequence classification with improved extreme learning machine algorithms. BioMed Res Int. 2014. https:\/\/doi.org\/10.1155\/2014\/103054.","journal-title":"BioMed Res Int"},{"key":"304_CR25","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.2016239118","volume":"118","author":"A Rives","year":"2021","unstructured":"Rives A, et al. Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences. Proc Natl Acad Sci USA. 2021;118: e2016239118.","journal-title":"Proc Natl Acad Sci USA"},{"key":"304_CR26","first-page":"9689","volume":"32","author":"R Rao","year":"2019","unstructured":"Rao R, et al. Evaluating protein transfer learning with tape. Adv Neural Inf Process Syst. 2019;32:9689\u2013701.","journal-title":"Adv Neural Inf Process Syst"},{"issue":"10","key":"304_CR27","doi-asserted-by":"publisher","first-page":"7112","DOI":"10.1109\/TPAMI.2021.3095381","volume":"44","author":"A Elnaggar","year":"2022","unstructured":"Elnaggar A, Heinzinger M, Dallago C, Rehawi G, Wang Y, Jones L, Gibbs T, Feher T, Angerer C, Steinegger M, Bhowmik D, Rost B. ProtTrans: toward understanding the language of life through self-supervised learning. IEEE Trans Pattern Anal Mach Intell. 2022;44(10):7112\u201327. https:\/\/doi.org\/10.1109\/TPAMI.2021.3095381.","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"304_CR28","first-page":"169","volume":"10","author":"A Shinde","year":"2019","unstructured":"Shinde A, D\u2019Silva M. Protein sequence classification using natural language processing. Int J Eng Dev Res. 2019;10:169\u201375.","journal-title":"Int J Eng Dev Res"},{"key":"304_CR29","doi-asserted-by":"publisher","DOI":"10.1101\/626507","author":"LM Bileschi","year":"2019","unstructured":"Bileschi LM, Belanger D, Bryant D, et al. Using deep learning to annotate protein universe. BioRxiv. 2019. https:\/\/doi.org\/10.1101\/626507.","journal-title":"BioRxiv"},{"issue":"8","key":"304_CR30","doi-asserted-by":"publisher","first-page":"2102","DOI":"10.1093\/bioinformatics\/btac020","volume":"38","author":"N Brandes","year":"2022","unstructured":"Brandes N, Ofer D, Peleg Y, Rappoport N, Linial M. ProteinBERT: a universal deep-learning model of protein sequence and function. Bioinformatics. 2022;38(8):2102\u201310. https:\/\/doi.org\/10.1093\/bioinformatics\/btac020.","journal-title":"Bioinformatics"},{"key":"304_CR31","unstructured":"Uniprot. Uniref100. Retrieved from https:\/\/www.uniprot.org\/help\/downloads"},{"key":"304_CR32","unstructured":"Elnaggar, A., Heinzinger, M., Dallago, C., et al. ProtTrans towards cracking the language of life. bioRxiv. 2020. https:\/\/www.biorxiv.org\/content\/early\/2020\/07\/21\/2020.07.12.199554"},{"issue":"Suppl 13","key":"304_CR33","doi-asserted-by":"publisher","first-page":"380","DOI":"10.1186\/s12859-020-03673-5","volume":"21","author":"Wang","year":"2020","unstructured":"Wang, et al. Biomedical document triage using a hierarchical attention-based capsule network. BMC Bioinform. 2020;21(Suppl 13):380. https:\/\/doi.org\/10.1186\/s12859-020-03673-5.","journal-title":"BMC Bioinform"},{"key":"304_CR34","doi-asserted-by":"publisher","first-page":"136475","DOI":"10.1016\/j.ijbiomac.2024.136475","volume":"282","author":"N Almusallam","year":"2024","unstructured":"Almusallam N, Ali F, Masmoudi A, Abu Ghazalah S, Alsini R, Yafoz A. An omics-driven computational model for angiogenic protein prediction: advancing therapeutic strategies with Ens-deep-AGP. Int J Biol Macromol. 2024;282:136475. https:\/\/doi.org\/10.1016\/j.ijbiomac.2024.136475.","journal-title":"Int J Biol Macromol"},{"key":"304_CR35","doi-asserted-by":"publisher","first-page":"102448","DOI":"10.1016\/j.jocs.2024.102448","volume":"83","author":"F Ali","year":"2024","unstructured":"Ali F, Khalid M, Masmoudi A, Alghamdi W, Yafoz A, Alsini R. VEGF-ERCNN: A deep learning-based model for prediction of vascular endothelial growth factor using ensemble residual CNN. J Computat Sci. 2024;83:102448. https:\/\/doi.org\/10.1016\/j.jocs.2024.102448.","journal-title":"J Computat Sci"},{"key":"304_CR36","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1016\/j.ymeth.2024.04.004","volume":"226","author":"F Ali","year":"2024","unstructured":"Ali F, Almuhaimeed A, Khalid M, Alshanbari H, Masmoudi A, Alsini R. DEEP-EP: Identification of epigenetic protein by ensemble residual convolutional neural network for drug discovery. Methods. 2024;226:49\u201353. https:\/\/doi.org\/10.1016\/j.ymeth.2024.04.004.","journal-title":"Methods"},{"key":"304_CR37","doi-asserted-by":"publisher","DOI":"10.1080\/07391102.2024.2329777","author":"M Khalid","year":"2024","unstructured":"Khalid M, Ali F, Alghamdi W, Alzahrani A, Alsini R, Alzahrani A. An ensemble computational model for prediction of clathrin protein by coupling machine learning with discrete cosine transform. J Biomol Struct Dynam. 2024. https:\/\/doi.org\/10.1080\/07391102.2024.2329777.","journal-title":"J Biomol Struct Dynam"},{"key":"304_CR38","doi-asserted-by":"publisher","DOI":"10.1080\/07391102.2023.2269280","author":"O Alghushairy","year":"2023","unstructured":"Alghushairy O, Ali F, Alghamdi W, Khalid M, Alsini R, Asiry O. Machine learning-based model for accurate identification of druggable proteins using light extreme gradient boosting. J Bio Struct Dynam. 2023. https:\/\/doi.org\/10.1080\/07391102.2023.2269280.","journal-title":"J Bio Struct Dynam"},{"issue":"11","key":"304_CR39","doi-asserted-by":"publisher","first-page":"5725","DOI":"10.1080\/07391102.2023.2243523","volume":"42","author":"A Adnan","year":"2023","unstructured":"Adnan A, Hongya W, Ali F, Khalid M, Alghushairy O, Alsini R. A bi-layer model for identification of piwiRNA using deep neural learning. J Biomol Struct Dyn. 2023;42(11):5725\u201333. https:\/\/doi.org\/10.1080\/07391102.2023.2243523.","journal-title":"J Biomol Struct Dyn"},{"key":"304_CR40","doi-asserted-by":"publisher","DOI":"10.1016\/j.ijbiomac.2023.125296","volume":"243","author":"F Ali","year":"2023","unstructured":"Ali F, Alghamdi W, Almagrabi AO, Alghushairy O, Banjar A, Khalid M. Deep-AGP: prediction of angiogenic protein by integrating two-dimensional convolutional neural network with discrete cosine transform. Int J Biol Macromol. 2023;243: 125296. https:\/\/doi.org\/10.1016\/j.ijbiomac.2023.125296.","journal-title":"Int J Biol Macromol"},{"key":"304_CR41","unstructured":"Kaggle dataset. Structural protein sequences. https:\/\/www.kaggle.com\/datasets\/shahir\/protein-data-set"},{"key":"304_CR42","doi-asserted-by":"publisher","unstructured":"Parikh, Y. and Abdelfattah, N. Machine Learning Models to Predict Multiclass Protein Classifications, IEEE 10th Annual Ubiquitous Computing, Electronics & Mobile Communication Conference, pp. 0300\u20130304, 2019. https:\/\/doi.org\/10.1109\/UEMCON47517.2019.8993049","DOI":"10.1109\/UEMCON47517.2019.8993049"},{"key":"304_CR43","doi-asserted-by":"publisher","unstructured":"Yang, Y., Liang, Lu, B. and Yun, Yang, W. Classification of protein sequences based on word segmentation methods, Proceedings of the 6th Asia-Pacific Bioinformatics Conference. pp. 1\u201310. 2007. https:\/\/doi.org\/10.1142\/9781848161092_0020","DOI":"10.1142\/9781848161092_0020"},{"key":"304_CR44","doi-asserted-by":"publisher","first-page":"1161","DOI":"10.4236\/jbise.2013.612145","volume":"6","author":"J Li","year":"2013","unstructured":"Li J, Wu J, Chen K. PFP-RFSM: protein fold prediction by using random forests and sequence motifs. J Biomed Sci Eng. 2013;6:1161\u201370. https:\/\/doi.org\/10.4236\/jbise.2013.612145.","journal-title":"J Biomed Sci Eng"},{"key":"304_CR45","doi-asserted-by":"publisher","DOI":"10.1109\/SoutheastCon42311.2019.9020333","author":"S Jalal","year":"2019","unstructured":"Jalal S, Zhong J, Kumar S. Protein secondary structure prediction using multi input convolutional neural network. SoutheastCon. 2019. https:\/\/doi.org\/10.1109\/SoutheastCon42311.2019.9020333.","journal-title":"SoutheastCon"},{"issue":"13","key":"304_CR46","doi-asserted-by":"publisher","first-page":"i254","DOI":"10.1093\/bioinformatics\/bty275","volume":"34","author":"S Seo","year":"2018","unstructured":"Seo S, Oh M, Park Y, Kim S. DeepFam: deep learning-based alignment-free method for protein family modeling and prediction. Bioinformatics. 2018;34(13):i254\u201362. https:\/\/doi.org\/10.1093\/bioinformatics\/bty275.","journal-title":"Bioinformatics"},{"issue":"8","key":"304_CR47","doi-asserted-by":"publisher","first-page":"1444","DOI":"10.1038\/s41592-024-02362-y","volume":"21","author":"J Bernett","year":"2024","unstructured":"Bernett J, Blumenthal DB, Grimm DG, Haselbeck F, Joeres R, Kalinina OV, List M. Guiding questions to avoid data leakage in biological machine learning applications. Nat Methods. 2024;21(8):1444\u201353. https:\/\/doi.org\/10.1038\/s41592-024-02362-y.","journal-title":"Nat Methods"},{"issue":"12","key":"304_CR48","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pcbi.1003926","volume":"10","author":"H Cheng","year":"2014","unstructured":"Cheng H, Schaeffer RD, Liao Y, Kinch LN, Pei J, Shi S, et al. ECOD: an evolutionary classification of protein domains. PLoS Comput Biol. 2014;10(12): e1003926. https:\/\/doi.org\/10.1371\/journal.pcbi.1003926.","journal-title":"PLoS Comput Biol"}],"container-title":["Discover Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44163-025-00304-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s44163-025-00304-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44163-025-00304-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,22]],"date-time":"2025-05-22T08:28:20Z","timestamp":1747902500000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s44163-025-00304-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,22]]},"references-count":48,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["304"],"URL":"https:\/\/doi.org\/10.1007\/s44163-025-00304-x","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-5045037\/v1","asserted-by":"object"},{"id-type":"doi","id":"10.1101\/2024.08.23.609306","asserted-by":"object"}]},"ISSN":["2731-0809"],"issn-type":[{"value":"2731-0809","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,5,22]]},"assertion":[{"value":"6 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"The authors declare no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"66"}}