{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T11:24:34Z","timestamp":1778239474882,"version":"3.51.4"},"reference-count":44,"publisher":"Oxford University Press (OUP)","issue":"9","license":[{"start":{"date-parts":[[2024,9,14]],"date-time":"2024-09-14T00:00:00Z","timestamp":1726272000000},"content-version":"vor","delay-in-days":13,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100009708","name":"Novo Nordisk Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100009708","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002341","name":"Academy of Finland","doi-asserted-by":"publisher","award":["332844"],"award-info":[{"award-number":["332844"]}],"id":[{"id":"10.13039\/501100002341","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,9,2]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:sec>\n                    <jats:title>Motivation<\/jats:title>\n                    <jats:p>Understanding biological processes relies heavily on curated knowledge of physical interactions between proteins. Yet, a notable gap remains between the information stored in databases of curated knowledge and the plethora of interactions documented in the scientific literature.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Results<\/jats:title>\n                    <jats:p>To bridge this gap, we introduce ComplexTome, a manually annotated corpus designed to facilitate the development of text-mining methods for the extraction of complex formation relationships among biomedical entities targeting the downstream semantics of the physical interaction subnetwork of the STRING database. 
This corpus comprises 1287 documents with \u223c3500 relationships. We train a novel relation extraction model on this corpus and find that it can highly reliably identify physical protein interactions (F1-score\u2009=\u200982.8%). We additionally enhance the model\u2019s capabilities through unsupervised trigger word detection and apply it to extract relations and trigger words for these relations from all open publications in the domain literature. This information has been fully integrated into the latest version of the STRING database.<\/jats:p>\n                  <\/jats:sec>\n                  <jats:sec>\n                    <jats:title>Availability and implementation<\/jats:title>\n                    <jats:p>We provide the corpus, code, and all results produced by the large-scale runs of our systems on biomedical literature via Zenodo https:\/\/doi.org\/10.5281\/zenodo.8139716, Github https:\/\/github.com\/farmeh\/ComplexTome_extraction, and the latest version of STRING database https:\/\/string-db.org\/.<\/jats:p>\n                  <\/jats:sec>","DOI":"10.1093\/bioinformatics\/btae552","type":"journal-article","created":{"date-parts":[[2024,9,12]],"date-time":"2024-09-12T21:48:20Z","timestamp":1726177700000},"source":"Crossref","is-referenced-by-count":16,"title":["STRING-ing together protein complexes: corpus and methods for extracting physical protein interactions from the biomedical literature"],"prefix":"10.1093","volume":"40","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5555-2828","authenticated-orcid":false,"given":"Farrokh","family":"Mehryary","sequence":"first","affiliation":[{"name":"TurkuNLP Group, Department of Computing, University of Turku , Turku 20014,","place":["Finland"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3611-5726","authenticated-orcid":false,"given":"Katerina","family":"Nastou","sequence":"additional","affiliation":[{"name":"Novo Nordisk Foundation Center for Protein 
Research, University of Copenhagen , Copenhagen 2200,","place":["Denmark"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tomoko","family":"Ohta","sequence":"additional","affiliation":[{"name":"Textimi , 1-37-13 Kitazawa , Tokyo, Setagaya-ku 155-0031,","place":["Japan"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7885-715X","authenticated-orcid":false,"given":"Lars Juhl","family":"Jensen","sequence":"additional","affiliation":[{"name":"Novo Nordisk Foundation Center for Protein Research, University of Copenhagen , Copenhagen 2200,","place":["Denmark"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sampo","family":"Pyysalo","sequence":"additional","affiliation":[{"name":"TurkuNLP Group, Department of Computing, University of Turku , Turku 20014,","place":["Finland"]}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"286","published-online":{"date-parts":[[2024,9,14]]},"reference":[{"key":"2024093015200910700_btae552-B1","first-page":"10","author":"Bj\u00f6rne","year":"2009"},{"key":"2024093015200910700_btae552-B2","doi-asserted-by":"crossref","first-page":"139","DOI":"10.1016\/j.artmed.2004.07.016","article-title":"Comparative experiments on learning information extractors for proteins and their interactions","volume":"33","author":"Bunescu","year":"2005","journal-title":"Artif Intell Med"},{"key":"2024093015200910700_btae552-B3","doi-asserted-by":"crossref","first-page":"btad557","DOI":"10.1093\/bioinformatics\/btad557","article-title":"An extensive benchmark study on biomedical text generation and mining with chatgpt","volume":"39","author":"Chen","year":"2023","journal-title":"Bioinformatics"},{"key":"2024093015200910700_btae552-B4","doi-asserted-by":"crossref","first-page":"37","DOI":"10.1177\/001316446002000104","article-title":"A coefficient of agreement for nominal scales","volume":"20","author":"Cohen","year":"1960","journal-title":"Educ Psychol 
Meas"},{"key":"2024093015200910700_btae552-B5","doi-asserted-by":"crossref","first-page":"3533","DOI":"10.1093\/bioinformatics\/btz070","article-title":"Pmc text mining subset in bioc: about three million full-text articles and growing","volume":"35","author":"Comeau","year":"2019","journal-title":"Bioinformatics"},{"key":"2024093015200910700_btae552-B6","first-page":"4171","author":"Devlin","year":"2019"},{"key":"2024093015200910700_btae552-B7","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1007\/978-981-19-8234-7_6","volume-title":"Recent Challenges in Intelligent Information and Database Systems","author":"Dewi","year":"2022"},{"key":"2024093015200910700_btae552-B8","doi-asserted-by":"crossref","first-page":"D808","DOI":"10.1093\/nar\/gks1094","article-title":"String v9. 1: protein-protein interaction networks, with increased coverage and integration","volume":"41","author":"Franceschini","year":"2012","journal-title":"Nucleic Acids Res"},{"key":"2024093015200910700_btae552-B9","doi-asserted-by":"publisher","first-page":"365","DOI":"10.1093\/bioinformatics\/btl616","article-title":"RelEx\u2014relation extraction using dependency parse trees","volume":"23","author":"Fundel","year":"2006","journal-title":"Bioinformatics"},{"key":"2024093015200910700_btae552-B10","doi-asserted-by":"crossref","first-page":"D687","DOI":"10.1093\/nar\/gkab1028","article-title":"The reactome pathway knowledgebase 2022","volume":"50","author":"Gillespie","year":"2022","journal-title":"Nucleic Acids Res"},{"key":"2024093015200910700_btae552-B11","doi-asserted-by":"crossref","first-page":"D559","DOI":"10.1093\/nar\/gky973","article-title":"Corum: the comprehensive resource of mammalian protein complexes\u20142019","volume":"47","author":"Giurgiu","year":"2019","journal-title":"Nucleic Acids Res"},{"key":"2024093015200910700_btae552-B12","author":"Jahan","year":"2023"},{"key":"2024093015200910700_btae552-B13","doi-asserted-by":"crossref","article-title":"One tagger, many uses: 
Illustrating the power of ontologies in dictionary-based named entity recognition","author":"Jensen","DOI":"10.1101\/067132"},{"key":"2024093015200910700_btae552-B14","doi-asserted-by":"publisher","author":"Jimenez Gutierrez","year":"2022","DOI":"10.18653\/v1\/2022.findings-emnlp.329"},{"key":"2024093015200910700_btae552-B15","doi-asserted-by":"publisher","first-page":"D632","DOI":"10.1093\/nar\/gkab1048","article-title":"HumanNet v3: an improved database of human gene networks for disease research","volume":"50","author":"Kim","year":"2021","journal-title":"Nucleic Acids Res"},{"key":"2024093015200910700_btae552-B16","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1186\/1471-2105-9-10","article-title":"Corpus annotation for mining biomedical events from literature","volume":"9","author":"Kim","year":"2008","journal-title":"BMC Bioinformatics"},{"key":"2024093015200910700_btae552-B17","author":"Kim","year":"2009"},{"issue":"Suppl 2","key":"2024093015200910700_btae552-B18","doi-asserted-by":"crossref","first-page":"S4","DOI":"10.1186\/gb-2008-9-s2-s4","article-title":"Overview of the protein-protein interaction annotation extraction task of biocreative II","volume":"9","author":"Krallinger","year":"2008","journal-title":"Genome Biol"},{"key":"2024093015200910700_btae552-B19","doi-asserted-by":"publisher","first-page":"1234","DOI":"10.1093\/bioinformatics\/btz682","article-title":"BioBERT: a pre-trained biomedical language representation model for biomedical text mining","volume":"36","author":"Lee","year":"2019","journal-title":"Bioinformatics"},{"key":"2024093015200910700_btae552-B20","doi-asserted-by":"publisher","first-page":"146","DOI":"10.18653\/v1\/2020.clinicalnlp-1.17","author":"Lewis","year":"2020"},{"key":"2024093015200910700_btae552-B21","doi-asserted-by":"crossref","first-page":"D857","DOI":"10.1093\/nar\/gkr930","article-title":"Mint, the molecular interaction database: 2012 
update","volume":"40","author":"Licata","year":"2012","journal-title":"Nucleic Acids Res"},{"key":"2024093015200910700_btae552-B22","author":"Lundberg","year":"2017"},{"key":"2024093015200910700_btae552-B23","doi-asserted-by":"crossref","first-page":"276","DOI":"10.11613\/BM.2012.031","article-title":"Interrater reliability: the kappa statistic","volume":"22","author":"McHugh","year":"2012","journal-title":"Biochem Med (Zagreb)"},{"key":"2024093015200910700_btae552-B24","doi-asserted-by":"publisher","author":"Mehryary","year":"2016","DOI":"10.18653\/v1\/W16-3009"},{"key":"2024093015200910700_btae552-B25","doi-asserted-by":"crossref","first-page":"bay120","DOI":"10.1093\/database\/bay120","article-title":"Potent pairing: ensemble of long short-term memory networks and support vector machine for chemical-protein relation extraction","volume":"2018","author":"Mehryary","year":"2018","journal-title":"Database"},{"key":"2024093015200910700_btae552-B26","author":"Mehryary","year":"2020"},{"key":"2024093015200910700_btae552-B27","doi-asserted-by":"crossref","first-page":"D550","DOI":"10.1093\/nar\/gky1001","article-title":"Complex portal 2018: extended content and enhanced visualization tools for macromolecular complexes","volume":"47","author":"Meldal","year":"2019","journal-title":"Nucleic Acids Res"},{"key":"2024093015200910700_btae552-B28","doi-asserted-by":"publisher","first-page":"100756","DOI":"10.1016\/j.websem.2022.100756","article-title":"Comparison of biomedical relationship extraction methods and models for knowledge graph creation","volume":"75","author":"Milo\u0161evi\u0107","year":"2023","journal-title":"J Web Semantics"},{"key":"2024093015200910700_btae552-B29","doi-asserted-by":"crossref","first-page":"baad080","DOI":"10.1093\/database\/baad080","article-title":"Overview of drugprot task at biocreative vii: Data and methods for large-scale text mining and knowledge graph generation of heterogenous chemical\u2013protein 
relations","volume":"2023","author":"Miranda-Escalada","year":"2023","journal-title":"Database"},{"key":"2024093015200910700_btae552-B30","author":"N\u00e9dellec","year":"2013"},{"key":"2024093015200910700_btae552-B31","author":"OpenAI","year":"2024"},{"key":"2024093015200910700_btae552-B32","doi-asserted-by":"crossref","first-page":"D358","DOI":"10.1093\/nar\/gkt1115","article-title":"The mintact project\u2014intact as a common curation platform for 11 molecular interaction databases","volume":"42","author":"Orchard","year":"2014","journal-title":"Nucleic Acids Res"},{"key":"2024093015200910700_btae552-B33","doi-asserted-by":"crossref","first-page":"187","DOI":"10.1002\/pro.3978","article-title":"The biogrid database: a comprehensive biomedical resource of curated protein, genetic, and chemical interactions","volume":"30","author":"Oughtred","year":"2021","journal-title":"Protein Sci"},{"key":"2024093015200910700_btae552-B34","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1186\/1471-2105-8-50","article-title":"Bioinfer: a corpus for information extraction in the biomedical domain","volume":"8","author":"Pyysalo","year":"2007","journal-title":"BMC Bioinformatics"},{"key":"2024093015200910700_btae552-B35","author":"Pyysalo","year":"2008"},{"key":"2024093015200910700_btae552-B36","author":"Pyysalo","year":"2011"},{"key":"2024093015200910700_btae552-B37","author":"Stenetorp","year":"2012"},{"key":"2024093015200910700_btae552-B38","first-page":"3319","author":"Sundararajan","year":"2017"},{"key":"2024093015200910700_btae552-B39","doi-asserted-by":"crossref","first-page":"D605","DOI":"10.1093\/nar\/gkaa1074","article-title":"The string database in 2021: Customizable protein\u2013protein networks, and functional characterization of user-uploaded gene\/measurement sets","volume":"49","author":"Szklarczyk","year":"2021","journal-title":"Nucleic Acids 
Res"},{"key":"2024093015200910700_btae552-B40","doi-asserted-by":"crossref","first-page":"D638","DOI":"10.1093\/nar\/gkac1000","article-title":"The string database in 2023: Protein\u2013protein association networks and functional enrichment analyses for any sequenced genome of interest","volume":"51","author":"Szklarczyk","year":"2023","journal-title":"Nucleic Acids Res"},{"key":"2024093015200910700_btae552-B41","article-title":"Llama 2: open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"2024093015200910700_btae552-B42","author":"Vaswani","year":"2017"},{"key":"2024093015200910700_btae552-B43","author":"Wan","year":"2023"},{"key":"2024093015200910700_btae552-B44","author":"Wang","year":"2023"}],"container-title":["Bioinformatics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/bioinformatics\/advance-article-pdf\/doi\/10.1093\/bioinformatics\/btae552\/59122666\/btae552.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/40\/9\/btae552\/59459850\/btae552.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article-pdf\/40\/9\/btae552\/59459850\/btae552.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T11:20:25Z","timestamp":1727695225000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/bioinformatics\/article\/doi\/10.1093\/bioinformatics\/btae552\/7758064"}},"subtitle":[],"editor":[{"given":"Zhiyong","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"editor","vocabulary":"crossref"}]}],"short-title":[],"issued":{"date-parts":[[2024,9]]},"references-count":44,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2024,9,2]]}},
"URL":"https:\/\/doi.org\/10.1093\/bioinformatics\/btae552","relation":{"has-preprint":[{"id-type":"doi","id":"10.1101\/2023.12.10.570999","asserted-by":"object"}]},"ISSN":["1367-4811"],"issn-type":[{"value":"1367-4811","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2024,9]]},"published":{"date-parts":[[2024,9]]},"article-number":"btae552"}}