{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,12]],"date-time":"2025-06-12T04:06:37Z","timestamp":1749701197121,"version":"3.41.0"},"reference-count":39,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T00:00:00Z","timestamp":1749600000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T00:00:00Z","timestamp":1749600000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["BioData Mining"],"DOI":"10.1186\/s13040-025-00454-9","type":"journal-article","created":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T06:24:18Z","timestamp":1749623058000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A probabilistic approach for building disease phenotypes across electronic health records"],"prefix":"10.1186","volume":"18","author":[{"given":"David","family":"Vidmar","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jessica","family":"De Freitas","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Will","family":"Thompson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"John M.","family":"Pfeifer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Brandon K.","family":"Fornwalt","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Noah","family":"Zimmerman","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Riccardo","family":"Miotto","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruijun","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,6,11]]},"reference":[{"key":"454_CR1","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1093\/jamia\/ocx110","volume":"25","author":"G Hripcsak","year":"2018","unstructured":"Hripcsak G, Albers DJ. High-fidelity phenotyping: richness and freedom from bias. J Am Med Inf Assoc JAMIA. 2018;25:289\u201394. https:\/\/doi.org\/10.1093\/jamia\/ocx110.","journal-title":"J Am Med Inf Assoc JAMIA"},{"key":"454_CR2","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1016\/j.ajhg.2020.03.007","volume":"106","author":"C DeBoever","year":"2020","unstructured":"DeBoever C, Tanigawa Y, Aguirre M, et al. Assessing digital phenotyping to enhance genetic studies of human diseases. Am J Hum Genet. 2020;106:611\u201322. https:\/\/doi.org\/10.1016\/j.ajhg.2020.03.007.","journal-title":"Am J Hum Genet"},{"key":"454_CR3","doi-asserted-by":"publisher","first-page":"1287","DOI":"10.1161\/CIRCULATIONAHA.120.047829","volume":"143","author":"S Raghunath","year":"2021","unstructured":"Raghunath S, Pfeifer JM, Ulloa-Cerna AE, et al. Deep neural networks can predict New-Onset atrial fibrillation from the 12-Lead ECG and help identify those at risk of atrial fibrillation\u2013Related stroke. Circulation. 2021;143:1287\u201398. https:\/\/doi.org\/10.1161\/CIRCULATIONAHA.120.047829.","journal-title":"Circulation"},{"key":"454_CR4","doi-asserted-by":"publisher","first-page":"e22296","DOI":"10.2196\/22296","volume":"5","author":"A Chandra","year":"2021","unstructured":"Chandra A, Philips ST, Pandey A, et al. Electronic health Records\u2013Based Cardio-Oncology registry for care gap identification and pragmatic research: procedure and observational study. JMIR Cardio. 2021;5:e22296. https:\/\/doi.org\/10.2196\/22296.","journal-title":"JMIR Cardio"},{"key":"454_CR5","doi-asserted-by":"publisher","first-page":"e56734","DOI":"10.2196\/56734","volume":"12","author":"PD Sood","year":"2024","unstructured":"Sood PD, Liu S, Lehmann H, et al. Assessing the effect of electronic health record data quality on identifying patients with type 2 diabetes: Cross-Sectional study. JMIR Med Inf. 2024;12:e56734. https:\/\/doi.org\/10.2196\/56734.","journal-title":"JMIR Med Inf"},{"key":"454_CR6","unstructured":"Leader JB, Pendergrass SA, Verma A et al. Contrasting association results between existing phewas phenotype definition methods and five validated electronic phenotypes. AMIA Annu Symp Proc AMIA Symp. 2015;2015:824\u201332."},{"key":"454_CR7","doi-asserted-by":"publisher","first-page":"e147","DOI":"10.1136\/amiajnl-2012-000896","volume":"20","author":"KM Newton","year":"2013","unstructured":"Newton KM, Peissig PL, Kho AN, et al. Validation of electronic medical record-based phenotyping algorithms: results and lessons learned from the eMERGE network. J Am Med Inf Assoc. 2013;20:e147\u201354. https:\/\/doi.org\/10.1136\/amiajnl-2012-000896.","journal-title":"J Am Med Inf Assoc"},{"key":"454_CR8","unstructured":"Eyre H, Chapman AB, Peterson KS et al. Launching into clinical space with medspaCy: a new clinical text processing toolkit in Python. AMIA Annu Symp Proc. 2022;2021:438\u201347."},{"key":"454_CR9","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1146\/annurev-biodatasci-080917-013315","volume":"1","author":"JM Banda","year":"2018","unstructured":"Banda JM, Seneviratne M, Hernandez-Boussard T, et al. Advances in electronic phenotyping: from Rule-Based definitions to machine learning models. Annu Rev Biomed Data Sci. 2018;1:53\u201368. https:\/\/doi.org\/10.1146\/annurev-biodatasci-080917-013315.","journal-title":"Annu Rev Biomed Data Sci"},{"key":"454_CR10","doi-asserted-by":"publisher","unstructured":"De Freitas JK, Johnson KW, Golden E, et al. Phe2vec: automated disease phenotyping based on unsupervised embeddings from electronic health records. Patterns. 2021;2. https:\/\/doi.org\/10.1016\/j.patter.2021.100337.","DOI":"10.1016\/j.patter.2021.100337"},{"key":"454_CR11","doi-asserted-by":"publisher","first-page":"e0192586","DOI":"10.1371\/journal.pone.0192586","volume":"13","author":"Y Ni","year":"2018","unstructured":"Ni Y, Alwell K, Moomaw CJ, et al. Towards phenotyping stroke: leveraging data from a large-scale epidemiological study to detect stroke diagnosis. PLoS ONE. 2018;13:e0192586. https:\/\/doi.org\/10.1371\/journal.pone.0192586.","journal-title":"PLoS ONE"},{"key":"454_CR12","doi-asserted-by":"publisher","first-page":"731","DOI":"10.1093\/jamia\/ocw011","volume":"23","author":"Y Halpern","year":"2016","unstructured":"Halpern Y, Horng S, Choi Y, et al. Electronic medical record phenotyping using the anchor and learn framework. J Am Med Inf Assoc JAMIA. 2016;23:731\u201340. https:\/\/doi.org\/10.1093\/jamia\/ocw011.","journal-title":"J Am Med Inf Assoc JAMIA"},{"key":"454_CR13","unstructured":"Shu K, Zheng G, Li Y et al. Leveraging multi-source weak social supervision for early detection of fake news. 2020."},{"key":"454_CR14","doi-asserted-by":"publisher","first-page":"5854","DOI":"10.1109\/TKDE.2021.3061215","volume":"34","author":"Z-Y Zhang","year":"2022","unstructured":"Zhang Z-Y, Zhao P, Jiang Y, et al. Learning from incomplete and inaccurate supervision. IEEE Trans Knowl Data Eng. 2022;34:5854\u201368. https:\/\/doi.org\/10.1109\/TKDE.2021.3061215.","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"454_CR15","doi-asserted-by":"publisher","first-page":"269","DOI":"10.14778\/3157794.3157797","volume":"11","author":"A Ratner","year":"2017","unstructured":"Ratner A, Bach SH, Ehrenberg H, et al. Snorkel: rapid training data creation with weak supervision. Proc VLDB Endow Int Conf Very Large Data Bases. 2017;11:269. https:\/\/doi.org\/10.14778\/3157794.3157797.","journal-title":"Proc VLDB Endow Int Conf Very Large Data Bases"},{"key":"454_CR16","doi-asserted-by":"publisher","first-page":"1988","DOI":"10.1109\/TPAMI.2005.249","volume":"27","author":"A Narasimhamurthy","year":"2005","unstructured":"Narasimhamurthy A. Theoretical bounds of majority voting performance for a binary classification problem. IEEE Trans Pattern Anal Mach Intell. 2005;27:1988\u201395. https:\/\/doi.org\/10.1109\/TPAMI.2005.249.","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"454_CR17","unstructured":"Platanios EA, Dubey A, Mitchell T. Estimating accuracy from unlabeled data: a bayesian approach. proceedings of the 33rd international conference on machine learning. PMLR 2016:1416\u201325."},{"key":"454_CR18","doi-asserted-by":"publisher","first-page":"4763","DOI":"10.1609\/aaai.v33i01.33014763","volume":"33","author":"A Ratner","year":"2019","unstructured":"Ratner A, Hancock B, Dunnmon J, et al. Training complex models with Multi-Task weak supervision. Proc AAAI Conf Artif Intell. 2019;33:4763\u201371. https:\/\/doi.org\/10.1609\/aaai.v33i01.33014763.","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"454_CR19","doi-asserted-by":"crossref","unstructured":"Maheshwari A, Chatterjee O, Killamsetty K, et al. Semi-Supervised data programming with subset selection. Association for Computational Linguistics; 2021.","DOI":"10.18653\/v1\/2021.findings-acl.408"},{"key":"454_CR20","doi-asserted-by":"crossref","unstructured":"Lison P, Hubin A, Barnes J, et al. Named entity recognition without labelled data: A weak supervision approach. Association for Computational Linguistics; 2020.","DOI":"10.18653\/v1\/2020.acl-main.139"},{"key":"454_CR21","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1186\/s12859-020-03542-1","volume":"21","author":"EK Mallory","year":"2020","unstructured":"Mallory EK, de Rochemonteix M, Ratner A, et al. Extracting chemical reactions from text using snorkel. BMC Bioinformatics. 2020;21:217. https:\/\/doi.org\/10.1186\/s12859-020-03542-1.","journal-title":"BMC Bioinformatics"},{"key":"454_CR22","unstructured":"Guo C, Pleiss G, Sun Y et al. On Calibration of Modern Neural Networks. 2017."},{"key":"454_CR23","doi-asserted-by":"publisher","first-page":"104269","DOI":"10.1016\/j.jbi.2022.104269","volume":"139","author":"E Getzen","year":"2023","unstructured":"Getzen E, Ungar L, Mowery D, et al. Mining for equitable health: assessing the impact of missing data in electronic health records. J Biomed Inf. 2023;139:104269. https:\/\/doi.org\/10.1016\/j.jbi.2022.104269.","journal-title":"J Biomed Inf"},{"key":"454_CR24","doi-asserted-by":"publisher","first-page":"1246","DOI":"10.1093\/jamia\/ocad066","volume":"30","author":"Y Zhou","year":"2023","unstructured":"Zhou Y, Shi J, Stein R, et al. Missing data matter: an empirical evaluation of the impacts of missing EHR data in comparative effectiveness research. J Am Med Inf Assoc. 2023;30:1246\u201356. https:\/\/doi.org\/10.1093\/jamia\/ocad066.","journal-title":"J Am Med Inf Assoc"},{"key":"454_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41746-021-00518-0","volume":"4","author":"J Li","year":"2021","unstructured":"Li J, Yan XS, Chaudhary D, et al. Imputation of missing values for electronic health record laboratory data. Npj Digit Med. 2021;4:1\u201314. https:\/\/doi.org\/10.1038\/s41746-021-00518-0.","journal-title":"Npj Digit Med"},{"key":"454_CR26","unstructured":"Harrell F. Classification vs. Prediction. Stat. Think. 2017. https:\/\/www.fharrell.com\/post\/classification\/. (Accessed 11 November 2024)."},{"key":"454_CR27","doi-asserted-by":"publisher","first-page":"345","DOI":"10.1136\/amiajnl-2013-001942","volume":"21","author":"M Rosenman","year":"2014","unstructured":"Rosenman M, He J, Martin J, et al. Database queries for hospitalizations for acute congestive heart failure: flexible methods and validation based on set theory. J Am Med Inf Assoc. 2014;21:345\u201352. https:\/\/doi.org\/10.1136\/amiajnl-2013-001942.","journal-title":"J Am Med Inf Assoc"},{"key":"454_CR28","doi-asserted-by":"publisher","first-page":"846","DOI":"10.1093\/jamia\/ocad260","volume":"31","author":"C Zeng","year":"2024","unstructured":"Zeng C, Schlueter DJ, Tran TC, et al. Comparison of phenomic profiles in the all of Us research program against the US general population and the UK biobank. J Am Med Inf Assoc. 2024;31:846\u201354. https:\/\/doi.org\/10.1093\/jamia\/ocad260.","journal-title":"J Am Med Inf Assoc"},{"key":"454_CR29","doi-asserted-by":"publisher","first-page":"e0191214","DOI":"10.1371\/journal.pone.0191214","volume":"13","author":"M Pujades-Rodriguez","year":"2018","unstructured":"Pujades-Rodriguez M, Guttmann OP, Gonzalez-Izquierdo A, et al. Identifying unmet clinical need in hypertrophic cardiomyopathy using National electronic health records. PLoS ONE. 2018;13:e0191214. https:\/\/doi.org\/10.1371\/journal.pone.0191214.","journal-title":"PLoS ONE"},{"key":"454_CR30","doi-asserted-by":"publisher","first-page":"e0280342","DOI":"10.1371\/journal.pone.0280342","volume":"18","author":"E Farrand","year":"2023","unstructured":"Farrand E, Collard HR, Guarnieri M, et al. Extracting patient-level data from the electronic health record: expanding opportunities for health system research. PLoS ONE. 2023;18:e0280342. https:\/\/doi.org\/10.1371\/journal.pone.0280342.","journal-title":"PLoS ONE"},{"key":"454_CR31","unstructured":"Bielinski SJ. Heart Failure (HF) with Differentiation between Preserved and Reduced Ejection Fraction| PheKB. 2013. https:\/\/phekb.org\/phenotype\/heart-failure-hf-differentiation-between-preserved-and-reduced-ejection-fraction. (Accessed 1 November 2024)."},{"key":"454_CR32","doi-asserted-by":"publisher","first-page":"1046","DOI":"10.1093\/jamia\/ocv202","volume":"23","author":"JC Kirby","year":"2016","unstructured":"Kirby JC, Speltz P, Rasmussen LV, et al. PheKB: a catalog and workflow for creating electronic phenotype algorithms for transportability. J Am Med Inf Assoc JAMIA. 2016;23:1046\u201352. https:\/\/doi.org\/10.1093\/jamia\/ocv202.","journal-title":"J Am Med Inf Assoc JAMIA"},{"key":"454_CR33","doi-asserted-by":"crossref","unstructured":"Ratner A, Snorkel. 2017. https:\/\/github.com\/snorkel-team\/snorkel","DOI":"10.1145\/3035918.3056442"},{"key":"454_CR34","doi-asserted-by":"publisher","first-page":"620","DOI":"10.1103\/PhysRev.106.620","volume":"106","author":"ET Jaynes","year":"1957","unstructured":"Jaynes ET. Information theory and statistical mechanics. Phys Rev. 1957;106:620\u201330. https:\/\/doi.org\/10.1103\/PhysRev.106.620.","journal-title":"Phys Rev"},{"key":"454_CR35","unstructured":"Jaynes ET. Where do we stand on maximum entropy? Maximum Entropy Formalism Conference, MIT. 1978."},{"key":"454_CR36","doi-asserted-by":"publisher","unstructured":"Vickers AJ, Van Calster B, Steyerberg EW. Net benefit approaches to the evaluation of prediction models, molecular markers, and diagnostic tests. BMJ. 2016;i6. https:\/\/doi.org\/10.1136\/bmj.i6.","DOI":"10.1136\/bmj.i6"},{"key":"454_CR37","unstructured":"Shin C, Sebag AS. Can we get smarter than majority vote? Efficient use of individual rater\u2019s labels for content moderation. 2nd Workshop on Efficient Natural Language and Speech Processing. 2022."},{"key":"454_CR38","unstructured":"Kumar A, Liang PS, Ma T. Verified uncertainty calibration. In: Wallach H, Larochelle H, Beygelzimer A, et al. editors. Advances in neural information processing systems. Curran Associates, Inc.; 2019."},{"key":"454_CR39","doi-asserted-by":"publisher","first-page":"359","DOI":"10.1198\/016214506000001437","volume":"102","author":"T Gneiting","year":"2007","unstructured":"Gneiting T, Raftery AE. Strictly proper scoring rules, prediction, and Estimation. J Am Stat Assoc. 2007;102:359\u201378. https:\/\/doi.org\/10.1198\/016214506000001437.","journal-title":"J Am Stat Assoc"}],"container-title":["BioData Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13040-025-00454-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1186\/s13040-025-00454-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1186\/s13040-025-00454-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T06:24:21Z","timestamp":1749623061000},"score":1,"resource":{"primary":{"URL":"https:\/\/biodatamining.biomedcentral.com\/articles\/10.1186\/s13040-025-00454-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,11]]},"references-count":39,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["454"],"URL":"https:\/\/doi.org\/10.1186\/s13040-025-00454-9","relation":{},"ISSN":["1756-0381"],"issn-type":[{"value":"1756-0381","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6,11]]},"assertion":[{"value":"13 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 June 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"This study was conducted on de-identified health information subject to an IRB exempt determination (Advarra Pro00072742) and did not involve human subjects research.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not Applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"All authors are employees and shareholders of Tempus AI, Inc.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"39"}}