{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T17:12:25Z","timestamp":1780765945972,"version":"3.54.1"},"reference-count":26,"publisher":"Oxford University Press (OUP)","issue":"6","license":[{"start":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T00:00:00Z","timestamp":1686182400000},"content-version":"vor","delay-in-days":7,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100009708","name":"Novo Nordisk Foundation","doi-asserted-by":"publisher","award":["NNF14CC0001"],"award-info":[{"award-number":["NNF14CC0001"]}],"id":[{"id":"10.13039\/501100009708","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002341","name":"Academy of Finland","doi-asserted-by":"publisher","award":["332844"],"award-info":[{"award-number":["332844"]}],"id":[{"id":"10.13039\/501100002341","id-type":"DOI","asserted-by":"publisher"}]},{"name":"European Union\u2019s Horizon 2020 research and innovation program"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,1]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:sec>\n                    <jats:title>Motivation<\/jats:title>\n                    <jats:p>The recognition of mentions of species names in text is a critically important task for biomedical text mining. While deep learning-based methods have made great advances in many named entity recognition tasks, results for species name recognition remain poor. We hypothesize that this is primarily due to the lack of appropriate corpora.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Results<\/jats:title>\n                    <jats:p>We introduce the S1000 corpus, a comprehensive manual re-annotation and extension of the S800 corpus. We demonstrate that S1000 makes highly accurate recognition of species names possible (F-score\u2009=93.1%), both for deep learning and dictionary-based methods.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Availability and implementation<\/jats:title>\n                    <jats:p>All resources introduced in this study are available under open licenses from https:\/\/jensenlab.org\/resources\/s1000\/. The webpage contains links to a Zenodo project and three GitHub repositories associated with the study.<\/jats:p>\n                  <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btad369","type":"journal-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T12:03:25Z","timestamp":1686225805000},"source":"Crossref","is-referenced-by-count":15,"title":["S1000: a better taxonomic name corpus for biomedical information extraction"],"prefix":"10.1093","volume":"39","author":[{"given":"Jouni","family":"Luoma","sequence":"first","affiliation":[{"name":"TurkuNLP Group, Department of Computing, University of Turku , Turku 20014, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3611-5726","authenticated-orcid":false,"given":"Katerina","family":"Nastou","sequence":"additional","affiliation":[{"name":"Novo Nordisk Foundation Center for Protein Research, University of Copenhagen , Blegdamsvej 3 , Copenhagen 2200, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tomoko","family":"Ohta","sequence":"additional","affiliation":[{"name":"Textimi , Tokyo, Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Harttu","family":"Toivonen","sequence":"additional","affiliation":[{"name":"TurkuNLP Group, Department of Computing, University of Turku , Turku 20014, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Evangelos","family":"Pafilis","sequence":"additional","affiliation":[{"name":"Hellenic Centre for Marine Research, Institute of Marine Biology, Biotechnology and Aquaculture , Heraklion 71003, Greece"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7885-715X","authenticated-orcid":false,"given":"Lars Juhl","family":"Jensen","sequence":"additional","affiliation":[{"name":"Novo Nordisk Foundation Center for Protein Research, University of Copenhagen , Blegdamsvej 3 , Copenhagen 2200, Denmark"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sampo","family":"Pyysalo","sequence":"additional","affiliation":[{"name":"TurkuNLP Group, Department of Computing, University of Turku , Turku 20014, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"286","published-online":{"date-parts":[[2023,6,8]]},"reference":[{"key":"2023062101002596600_btad369-B1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.jbi.2013.12.006","article-title":"NCBI disease corpus: a resource for disease name recognition and concept normalization","volume":"47","author":"Do\u011fan","year":"2014","journal-title":"J Biomed Inform"},{"key":"2023062101002596600_btad369-B2","doi-asserted-by":"crossref","first-page":"85","DOI":"10.1186\/1471-2105-11-85","article-title":"LINNAEUS: a species name identification system for biomedical literature","volume":"11","author":"Gerner","year":"2010","journal-title":"BMC Bioinformatics"},{"key":"2023062101002596600_btad369-B3","doi-asserted-by":"crossref","first-page":"4087","DOI":"10.1093\/bioinformatics\/bty449","article-title":"Transfer learning for biomedical named entity recognition with neural networks","volume":"34","author":"Giorgi","year":"2018","journal-title":"Bioinformatics"},{"key":"2023062101002596600_btad369-B4","first-page":"102","author":"Hakala","year":"2016"},{"key":"2023062101002596600_btad369-B5","first-page":"067132","author":"Jensen","year":"2016"},{"key":"2023062101002596600_btad369-B6","doi-asserted-by":"crossref","first-page":"i180","DOI":"10.1093\/bioinformatics\/btg1023","article-title":"GENIA corpus\u2014a semantically annotated corpus for bio-textmining","volume":"19","author":"Kim","year":"2003","journal-title":"Bioinformatics"},{"key":"2023062101002596600_btad369-B7","first-page":"70","author":"Kim","year":"2004"},{"key":"2023062101002596600_btad369-B8","doi-asserted-by":"crossref","first-page":"635","DOI":"10.1007\/978-3-030-68763-2_48","volume-title":"Pattern Recognition. ICPR International Workshops and Challenges","author":"Kocaman","year":"2021"},{"key":"2023062101002596600_btad369-B9","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/1758-2946-7-S1-S1","article-title":"The CHEMDNER corpus of chemicals and drugs and its annotation principles","volume":"7","author":"Krallinger","year":"2015","journal-title":"J Cheminform"},{"key":"2023062101002596600_btad369-B10","doi-asserted-by":"crossref","first-page":"301","DOI":"10.1007\/s00705-012-1454-0","article-title":"Virus nomenclature below the species level: a standardized nomenclature for natural variants of viruses assigned to the family Filoviridae","volume":"158","author":"Kuhn","year":"2013","journal-title":"Arch Virol"},{"key":"2023062101002596600_btad369-B11","doi-asserted-by":"crossref","first-page":"1234","DOI":"10.1093\/bioinformatics\/btz682","article-title":"BioBERT: a pre-trained biomedical language representation model for biomedical text mining","volume":"36","author":"Lee","year":"2020","journal-title":"Bioinformatics"},{"key":"2023062101002596600_btad369-B12","first-page":"146","author":"Lewis","year":"2020"},{"key":"2023062101002596600_btad369-B13","doi-asserted-by":"crossref","first-page":"baw068","DOI":"10.1093\/database\/baw068","article-title":"BioCreative V CDR task corpus: a resource for chemical disease relation extraction","volume":"2016","author":"Li","year":"2016","journal-title":"Database"},{"key":"2023062101002596600_btad369-B14","first-page":"904","author":"Luoma","year":"2020"},{"key":"2023062101002596600_btad369-B15","author":"Miranda","year":"2021"},{"key":"2023062101002596600_btad369-B16","doi-asserted-by":"crossref","first-page":"e65390","DOI":"10.1371\/journal.pone.0065390","article-title":"The species and organisms resources for fast and accurate identification of taxonomic names in text","volume":"8","author":"Pafilis","year":"2013","journal-title":"PLoS One"},{"key":"2023062101002596600_btad369-B17","author":"Phan","year":"2021"},{"key":"2023062101002596600_btad369-B18","doi-asserted-by":"crossref","first-page":"868","DOI":"10.1093\/bioinformatics\/btt580","article-title":"Anatomical entity mention recognition at literature scale","volume":"30","author":"Pyysalo","year":"2014","journal-title":"Bioinformatics"},{"key":"2023062101002596600_btad369-B19","doi-asserted-by":"crossref","first-page":"baaa062","DOI":"10.1093\/database\/baaa062","article-title":"NCBI taxonomy: a comprehensive update on curation, resources and tools","volume":"2020","author":"Schoch","year":"2020","journal-title":"Database"},{"key":"2023062101002596600_btad369-B20","author":"Sharma","year":"2019"},{"key":"2023062101002596600_btad369-B21","first-page":"4700","author":"Shin","year":"2020"},{"key":"2023062101002596600_btad369-B22","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/gb-2008-9-s2-s2","article-title":"Overview of BioCreative II gene mention recognition","volume":"9","author":"Smith","year":"2008","journal-title":"Genome Biol"},{"key":"2023062101002596600_btad369-B23","doi-asserted-by":"crossref","first-page":"D638","DOI":"10.1093\/nar\/gkac1000","article-title":"The string database in 2023: protein\u2013protein association networks and functional enrichment analyses for any sequenced genome of interest","volume":"51","author":"Szklarczyk","year":"2023","journal-title":"Nucleic Acids Res"},{"key":"2023062101002596600_btad369-B24","doi-asserted-by":"crossref","first-page":"D480","DOI":"10.1093\/nar\/gkaa1100","article-title":"UniProt: the universal protein knowledgebase in 2021","volume":"49","author":"The UniProt Consortium","year":"2021","journal-title":"Nucleic Acids Res"},{"key":"2023062101002596600_btad369-B25","first-page":"6000","author":"Vaswani","year":"2017"},{"key":"2023062101002596600_btad369-B26","doi-asserted-by":"crossref","first-page":"1892","DOI":"10.1093\/jamia\/ocab090","article-title":"Biomedical and clinical English model packages for the Stanza Python NLP library","volume":"28","author":"Zhang","year":"2021","journal-title":"J Am Med Inform Assoc"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btad369\/50540439\/btad369.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/39\/6\/btad369\/50661418\/btad369.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/39\/6\/btad369\/50661418\/btad369.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,21]],"date-time":"2023-06-21T01:39:29Z","timestamp":1687311569000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btad369\/7192170"}},"subtitle":[],"editor":[{"given":"Zhiyong","family":"Lu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"editor"}]}],"short-title":[],"issued":{"date-parts":[[2023,6,1]]},"references-count":26,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2023,6,1]]}},"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btad369","relation":{"has-preprint":[{"id-type":"doi","id":"10.1101\/2023.02.20.528934","asserted-by":"object"}]},"ISSN":["1367-4811"],"issn-type":[{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2023,6,1]]},"published":{"date-parts":[[2023,6,1]]},"article-number":"btad369"}}