{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T20:08:31Z","timestamp":1777925311787,"version":"3.51.4"},"reference-count":97,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,1,25]],"date-time":"2023-01-25T00:00:00Z","timestamp":1674604800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,25]],"date-time":"2023-01-25T00:00:00Z","timestamp":1674604800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Nat Mach Intell"],"DOI":"10.1038\/s42256-022-00596-z","type":"journal-article","created":{"date-parts":[[2023,1,25]],"date-time":"2023-01-25T17:03:59Z","timestamp":1674666239000},"page":"13-23","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":68,"title":["Learning from data with structured missingness"],"prefix":"10.1038","volume":"5","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9584-8044","authenticated-orcid":false,"given":"Robin","family":"Mitra","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2448-6714","authenticated-orcid":false,"given":"Sarah F.","family":"McGough","sequence":"additional","affiliation":[]},{"given":"Tapabrata","family":"Chakraborti","sequence":"additional","affiliation":[]},{"given":"Chris","family":"Holmes","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3399-0085","authenticated-orcid":false,"given":"Ryan","family":"Copping","sequence":"additional","affiliation":[]},{"given":"Niels","family":"Hagenbuch","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8900-8268","authenticated-orcid":false,"given":"Stefanie","family":"Biedermann","sequence":"additional","affiliation":[]},{"given":"Jack","family":"Noonan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7302-4391","authenticated-orcid":false,"given":"Brieuc","family":"Lehmann","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2453-9188","authenticated-orcid":false,"given":"Aditi","family":"Shenvi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8572-5208","authenticated-orcid":false,"given":"Xuan Vinh","family":"Doan","sequence":"additional","affiliation":[]},{"given":"David","family":"Leslie","sequence":"additional","affiliation":[]},{"given":"Ginestra","family":"Bianconi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6479-3028","authenticated-orcid":false,"given":"Ruben","family":"Sanchez-Garcia","sequence":"additional","affiliation":[]},{"given":"Alisha","family":"Davies","sequence":"additional","affiliation":[]},{"given":"Maxine","family":"Mackintosh","sequence":"additional","affiliation":[]},{"given":"Eleni-Rosalina","family":"Andrinopoulou","sequence":"additional","affiliation":[]},{"given":"Anahid","family":"Basiri","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4353-5780","authenticated-orcid":false,"given":"Chris","family":"Harbron","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5396-9750","authenticated-orcid":false,"given":"Ben D.","family":"MacArthur","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,1,25]]},"reference":[{"key":"596_CR1","doi-asserted-by":"crossref","unstructured":"Little, R. J. A. & Rubin, D. B. Statistical Analysis With Missing Data Vol. 793 (John Wiley & Sons, 2019).","DOI":"10.1002\/9781119482260"},{"key":"596_CR2","doi-asserted-by":"crossref","unstructured":"Karla\u0161, B. et al. Nearest neighbor classifiers over incomplete information: from certain answers to certain predictions. Preprint at https:\/\/arxiv.org\/abs\/2005.05117 (2020).","DOI":"10.14778\/3430915.3430917"},{"key":"596_CR3","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1093\/biomet\/63.3.581","volume":"63","author":"DB Rubin","year":"1976","unstructured":"Rubin, D. B. Inference and missing data. Biometrika 63, 581\u2013592 (1976).","journal-title":"Biometrika"},{"key":"596_CR4","doi-asserted-by":"publisher","first-page":"353","DOI":"10.1076\/edre.7.4.353.8937","volume":"7","author":"TD Pigott","year":"2001","unstructured":"Pigott, T. D. A review of methods for missing data. Educ. Res. Eval. 7, 353\u2013383 (2001).","journal-title":"Educ. Res. Eval."},{"key":"596_CR5","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1037\/1082-989X.7.2.147","volume":"7","author":"JL Schafer","year":"2002","unstructured":"Schafer, J. L. & Graham, J. W. Missing data: our view of the state of the art. Psychol. Methods 7, 147\u2013177 (2002).","journal-title":"Psychol. Methods"},{"key":"596_CR6","doi-asserted-by":"publisher","first-page":"2244","DOI":"10.1214\/aos\/1176348396","volume":"19","author":"DF Heitjan","year":"1991","unstructured":"Heitjan, D. F. & Rubin, D. B. Ignorability and coarse data. Ann. Stat. 19, 2244\u20132253 (1991).","journal-title":"Ann. Stat."},{"key":"596_CR7","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-021-00516-9","volume":"8","author":"T Emmanuel","year":"2021","unstructured":"Emmanuel, T. et al. A survey on missing data in machine learning. J. Big Data 8, 1\u201337 (2021).","journal-title":"J. Big Data"},{"key":"596_CR8","doi-asserted-by":"publisher","first-page":"829","DOI":"10.1162\/neco_a_01273","volume":"32","author":"J Gao","year":"2020","unstructured":"Gao, J., Li, P., Chen, Z. & Zhang, J. A survey on deep learning for multimodal data fusion. Neur. Comput. 32, 829\u2013864 (2020).","journal-title":"Neur. Comput."},{"key":"596_CR9","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1016\/j.neucom.2021.03.090","volume":"448","author":"X Yan","year":"2021","unstructured":"Yan, X., Hu, S., Mao, Y., Ye, Y. & Yu, H. Deep multi-view learning methods: a review. Neurocomputing 448, 106\u2013129 (2021).","journal-title":"Neurocomputing"},{"key":"596_CR10","unstructured":"Xu, C., Tao, D. & Xu, C. A survey on multi-view learning. Preprint at https:\/\/arxiv.org\/abs\/1304.5634 (2013)."},{"key":"596_CR11","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1038\/s41591-018-0300-7","volume":"25","author":"EJ Topol","year":"2019","unstructured":"Topol, E. J. High-performance medicine: the convergence of human and artificial intelligence. Nat. Med. 25, 44\u201356 (2019).","journal-title":"Nat. Med."},{"key":"596_CR12","doi-asserted-by":"crossref","unstructured":"Silva, L. A. V. & Rohr, K. Pan-cancer prognosis prediction using multimodal deep learning. In 2020 IEEE 17th International Symposium on Biomedical Imaging 568\u2013571 (IEEE, 2020).","DOI":"10.1109\/ISBI45749.2020.9098665"},{"key":"596_CR13","doi-asserted-by":"publisher","first-page":"473","DOI":"10.1080\/01621459.1996.10476908","volume":"91","author":"DB Rubin","year":"1996","unstructured":"Rubin, D. B. Multiple imputation after 18+ years. J. Am. Stat. Assoc. 91, 473\u2013489 (1996).","journal-title":"J. Am. Stat. Assoc."},{"key":"596_CR14","unstructured":"Bommasani, R. et al. On the opportunities and risks of foundation models. Preprint at https:\/\/arxiv.org\/abs\/2108.07258 (2021)."},{"key":"596_CR15","doi-asserted-by":"publisher","first-page":"305","DOI":"10.1038\/s42256-020-0186-1","volume":"2","author":"GA Kaissis","year":"2020","unstructured":"Kaissis, G. A., Makowski, M. R., R\u00fcckert, D. & Braren, R. F. Secure, privacy-preserving and federated machine learning in medical imaging. Nat. Mach. Intell. 2, 305\u2013311 (2020).","journal-title":"Nat. Mach. Intell."},{"key":"596_CR16","first-page":"50","volume":"37","author":"T Li","year":"2020","unstructured":"Li, T., Sahu, A. K., Talwalkar, A. & Smith, V. Federated learning: challenges, methods, and future directions. IEEE Signal Process. Mag. 37, 50\u201360 (2020).","journal-title":"IEEE Signal Process. Mag."},{"key":"596_CR17","unstructured":"Holmes, C. Artificial Intelligence and Health: A Summary Report of a Roundtable Held on 16 January 2019 (Academy of Medical Sciences, 2019); https:\/\/acmedsci.ac.uk\/policy\/policy-projects\/artificial--intelligence-and-health"},{"key":"596_CR18","doi-asserted-by":"publisher","first-page":"1278","DOI":"10.1093\/bioinformatics\/bty796","volume":"35","author":"X Dong","year":"2019","unstructured":"Dong, X. et al. TOBMI: trans-omics block missing data imputation using a k-nearest neighbor weighted approach. Bioinformatics 35, 1278\u20131283 (2019).","journal-title":"Bioinformatics"},{"key":"596_CR19","doi-asserted-by":"publisher","first-page":"1639","DOI":"10.1038\/s41467-021-21975-x","volume":"12","author":"T Naito","year":"2021","unstructured":"Naito, T. et al. A deep learning method for HLA imputation and trans-ethnic MHC fine-mapping of type 1 diabetes. Nat. Commun. 12, 1639 (2021).","journal-title":"Nat. Commun."},{"key":"596_CR20","doi-asserted-by":"publisher","first-page":"160","DOI":"10.1214\/18-STS646","volume":"33","author":"V Audigier","year":"2018","unstructured":"Audigier, V. et al. Multiple imputation for multilevel data with continuous and binary variables. Stat. Sci. 33, 160\u2013183 (2018).","journal-title":"Stat. Sci."},{"key":"596_CR21","doi-asserted-by":"publisher","unstructured":"Kamphuis, R., Jolani, S. & Lugtig, P. The blocked imputation approach for missing data. Preprint at ResearchGate https:\/\/doi.org\/10.13140\/RG.2.2.12467.32803 (2018).","DOI":"10.13140\/RG.2.2.12467.32803"},{"key":"596_CR22","doi-asserted-by":"publisher","first-page":"6085","DOI":"10.1038\/s41598-018-24271-9","volume":"8","author":"Z Che","year":"2018","unstructured":"Che, Z., Purushotham, S., Cho, K., Sontag, D. & Liu, Y. Recurrent neural networks for multivariate time series with missing values. Sci. Rep. 8, 6085 (2018).","journal-title":"Sci. Rep."},{"key":"596_CR23","unstructured":"Wang, Z., Akande, O., Poulos, J. & Li, F. Are deep learning models superior for missing data imputation in large surveys? Evidence from an empirical comparison. Preprint at https:\/\/arxiv.org\/abs\/2103.09316 (2021)."},{"key":"596_CR24","doi-asserted-by":"publisher","first-page":"e007450","DOI":"10.1136\/bmjopen-2014-007450","volume":"5","author":"NJ Tierney","year":"2015","unstructured":"Tierney, N. J., Harden, F. A., Harden, M. J. & Mengersen, K. L. Using decision trees to understand structure in missing data. BMJ Open 5, e007450 (2015).","journal-title":"BMJ Open"},{"key":"596_CR25","doi-asserted-by":"publisher","first-page":"2514","DOI":"10.1200\/JCO.2017.35.15_suppl.2514","volume":"35","author":"G Singal","year":"2017","unstructured":"Singal, G. et al. Development and validation of a real-world clinicogenomic database. J. Clin. Oncol. 35, 2514 (2017).","journal-title":"J. Clin. Oncol."},{"key":"596_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.18637\/jss.v045.i03","volume":"45","author":"S Van Buuren","year":"2011","unstructured":"Van Buuren, S. & Groothuis-Oudshoorn, K. mice: multivariate imputation by chained equations in R. J. Stat. Softw. 45, 1\u201367 (2011).","journal-title":"J. Stat. Softw."},{"key":"596_CR27","doi-asserted-by":"crossref","unstructured":"Leslie, D. et al. Artificial intelligence, human rights, democracy, and the rule of law: a primer. Preprint at https:\/\/arxiv.org\/abs\/2104.04147 (2021).","DOI":"10.2139\/ssrn.3817999"},{"key":"596_CR28","doi-asserted-by":"publisher","unstructured":"MacArthur, B. D., Dorobantu, C. & Margetts, H. Resilient government requires data science reform. Nat. Hum. Behav. https:\/\/doi.org\/10.1038\/s41562-022-01423-6 (2022).","DOI":"10.1038\/s41562-022-01423-6"},{"key":"596_CR29","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1214\/13-STS415","volume":"28","author":"S Seaman","year":"2013","unstructured":"Seaman, S., Galati, J., Jackson, D. & Carlin, J. What is meant by \u201cmissing at random\"? Stat. Sci. 28, 257\u2013268 (2013).","journal-title":"Stat. Sci."},{"key":"596_CR30","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1111\/insr.12242","volume":"86","author":"M Doretti","year":"2018","unstructured":"Doretti, M., Geneletti, S. & Stanghellini, E. Missing data: a unified taxonomy guided by conditional independence. Int. Stat. Rev. 86, 189\u2013204 (2018).","journal-title":"Int. Stat. Rev."},{"key":"596_CR31","unstructured":"Tian, J. Missing at random in graphical models. In Artificial Intelligence and Statistics 977\u2013985 (PMLR, 2015)."},{"key":"596_CR32","unstructured":"Antelmi, L. et al. Combining multi-task learning and multi-channel variational auto-encoders to exploit datasets with missing observations -application to multi-modal neuroimaging studies in dementia. Preprint at https:\/\/hal.inria.fr\/hal-03114888 (2021)."},{"key":"596_CR33","doi-asserted-by":"crossref","unstructured":"Newman, M. Networks (Oxford Univ. Press, 2018).","DOI":"10.1093\/oso\/9780198805090.001.0001"},{"key":"596_CR34","doi-asserted-by":"crossref","unstructured":"Bianconi, G. Higher-Order Networks (Cambridge Univ. Press, 2021).","DOI":"10.1017\/9781108770996"},{"key":"596_CR35","doi-asserted-by":"publisher","first-page":"20210110","DOI":"10.1098\/rspa.2021.0110","volume":"477","author":"AJ Gutknecht","year":"2021","unstructured":"Gutknecht, A. J., Wibral, M. & Makkeh, A. Bits and pieces: understanding information decomposition from part-whole relationships and formal logic. Proc. R. Soc. A 477, 20210110 (2021).","journal-title":"Proc. R. Soc. A"},{"key":"596_CR36","unstructured":"Bick, C., Gross, E., Harrington, H. A. & Schaub, M. T. What are higher-order networks? Preprint at https:\/\/arxiv.org\/abs\/2104.11329 (2021)."},{"key":"596_CR37","doi-asserted-by":"publisher","first-page":"255","DOI":"10.1090\/S0273-0979-09-01249-X","volume":"46","author":"G Carlsson","year":"2009","unstructured":"Carlsson, G. Topology and data. Bull. Am. Math. Soc. 46, 255\u2013308 (2009).","journal-title":"Bull. Am. Math. Soc."},{"key":"596_CR38","doi-asserted-by":"crossref","unstructured":"Joharinad, P. & Jost, J. Geometry of data. Preprint at https:\/\/arxiv.org\/abs\/2203.07208 (2022).","DOI":"10.1007\/978-3-031-12244-6_14"},{"key":"596_CR39","doi-asserted-by":"crossref","unstructured":"Bianconi, G. Multilayer Networks (Oxford Univ. Press, 2018).","DOI":"10.1093\/oso\/9780198753919.001.0001"},{"key":"596_CR40","doi-asserted-by":"crossref","unstructured":"Kiani, N. A., Gomez-Cabrero, D. & Bianconi, G. (eds) Networks of Networks in Biology (Cambridge Univ. Press, 2021).","DOI":"10.1017\/9781108553711"},{"key":"596_CR41","doi-asserted-by":"publisher","first-page":"2749","DOI":"10.1002\/sim.8148","volume":"38","author":"KM Lee","year":"2019","unstructured":"Lee, K. M., Biedermann, S. & Mitra, R. D-optimal designs for multiarm trials with dropouts. Stat. Med. 38, 2749\u20132766 (2019).","journal-title":"Stat. Med."},{"key":"596_CR42","doi-asserted-by":"publisher","unstructured":"Lee, K. M., Mitra, R. & Biedermann, S. Optimal design when outcome values are not missing at random. Stat. Sinica https:\/\/doi.org\/10.5705\/ss.202016.0526 (2018).","DOI":"10.5705\/ss.202016.0526"},{"key":"596_CR43","first-page":"1611","volume":"28","author":"KM Lee","year":"2018","unstructured":"Lee, K. M., Biedermann, S. & Mitra, R. Optimal design for experiments with possibly incomplete observations. Stat. Sinica 28, 1611\u20131632 (2018).","journal-title":"Stat. Sinica"},{"key":"596_CR44","doi-asserted-by":"crossref","unstructured":"Noonan, J. & Zhigljavsky, A. in Black Box Optimization, Machine Learning, and No-Free Lunch Theorems (eds Pardalos, P. M. et al.) 273\u2013318 (Springer, 2021).","DOI":"10.1007\/978-3-030-66515-9_10"},{"key":"596_CR45","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1007\/s43069-020-0015-8","volume":"1","author":"A Zhigljavsky","year":"2020","unstructured":"Zhigljavsky, A. & Noonan, J. Covering of high-dimensional cubes and quantization. SN Oper. Res. Forum 1, 18 (2020).","journal-title":"SN Oper. Res. Forum"},{"key":"596_CR46","doi-asserted-by":"publisher","first-page":"690","DOI":"10.1002\/sim.8797","volume":"40","author":"T Burnett","year":"2020","unstructured":"Burnett, T. & Jennison, C. Adaptive enrichment trials: what are the benefits? Stat. Med. 40, 690\u2013711 (2020).","journal-title":"Stat. Med."},{"key":"596_CR47","doi-asserted-by":"publisher","first-page":"218","DOI":"10.1016\/j.jclinepi.2021.11.023","volume":"142","author":"SWJ Nijman","year":"2022","unstructured":"Nijman, S. W. J. et al. Missing data is poorly handled and reported in prediction model studies using machine learning: a literature review. J. Clin. Epidemiol. 142, 218\u2013229 (2022).","journal-title":"J. Clin. Epidemiol."},{"key":"596_CR48","unstructured":"Ipsen, N., Mattei, P.-A. & Frellsen, J. How to deal with missing data in supervised deep learning? In Artemiss-ICML Workshop on the Art of Learning with Missing Values (2020)."},{"key":"596_CR49","unstructured":"Buolamwini, J. & Gebru, T. Gender shades: intersectional accuracy disparities in commercial gender classification. In Conference on Fairness, Accountability and Transparency 77\u201391 (PMLR, 2018)."},{"key":"596_CR50","doi-asserted-by":"publisher","unstructured":"Leslie, D. Understanding bias in facial recognition technologies. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2010.07023 (2020).","DOI":"10.48550\/arXiv.2010.07023"},{"key":"596_CR51","unstructured":"Gelman, A. et al. Bayesian Data Analysis (3rd ed.). (Chapman and Hall\/CRC, 2013)."},{"key":"596_CR52","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1080\/01621459.1990.10476213","volume":"85","author":"AE Gelfand","year":"1990","unstructured":"Gelfand, A. E. & Smith, A. F. M. Sampling-based approaches to calculating marginal densities. J. Am. Stat. Assoc. 85, 398\u2013409 (1990).","journal-title":"J. Am. Stat. Assoc."},{"key":"596_CR53","doi-asserted-by":"crossref","unstructured":"Van Buuren, S. Flexible Imputation of Missing Data (CRC, 2018).","DOI":"10.1201\/9780429492259"},{"key":"596_CR54","doi-asserted-by":"publisher","first-page":"2909","DOI":"10.1080\/00949655.2018.1491577","volume":"88","author":"RM Schouten","year":"2018","unstructured":"Schouten, R. M., Lugtig, P. & Vink, G. Generating missing values for simulation purposes: a multivariate amputation procedure. J. Stat. Comput. Sim. 88, 2909\u20132930 (2018).","journal-title":"J. Stat. Comput. Sim."},{"key":"596_CR55","unstructured":"Brand, J. P. L. Development, Implementation and Evaluation of Multiple Imputation Strategies for the Statistical Analysis of Incomplete Data Sets (Print Partners Ispkamp, 1999)."},{"key":"596_CR56","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1111\/1467-9574.00219","volume":"57","author":"JPL Brand","year":"2003","unstructured":"Brand, J. P. L., Van Buuren, S., Groothuis-Oudshoorn, K. & Gelsema, E. S. A toolkit in SAS for the evaluation of multiple imputation methods. Stat. Neerland. 57, 36\u201345 (2003).","journal-title":"Stat. Neerland."},{"key":"596_CR57","unstructured":"Mayer, I. Causal Inference from Heterogeneous Data with Missing Data: Application to Critical Care Management. PhD thesis, EHESS (2021)."},{"key":"596_CR58","unstructured":"Kusner, M. J., Loftus, J., Russell, C. & Silva, R. Counterfactual fairness. Advances in neural information processing systems, 30. NeurIPS (2017)."},{"key":"596_CR59","unstructured":"Shen, A., Han, X., Cohn, T., Baldwin, T. & Frermann, L. Contrastive learning for fair representations. Preprint at https:\/\/arxiv.org\/abs\/2109.10645 (2021)."},{"key":"596_CR60","first-page":"214","volume":"33","author":"P Ding","year":"2017","unstructured":"Ding, P. & Li, F. Causal inference: a missing data perspective. Stat. Sci. 33, 214\u2013237 (2017).","journal-title":"Stat. Sci."},{"key":"596_CR61","doi-asserted-by":"publisher","first-page":"278","DOI":"10.1177\/0962280210395740","volume":"22","author":"SR Seaman","year":"2013","unstructured":"Seaman, S. R. & White, I. R. Review of inverse probability weighting for dealing with missing data. Stat. Methods Med. Res. 22, 278\u2013295 (2013).","journal-title":"Stat. Methods Med. Res."},{"key":"596_CR62","doi-asserted-by":"publisher","first-page":"585","DOI":"10.1093\/aje\/kwx350","volume":"187","author":"BaoLuo Sun","year":"2017","unstructured":"Sun, BaoLuo et al. Inverse-probability-weighted estimation for monotone and nonmonotone missing data. Am. J. Epidemiol. 187, 585\u2013591 (2017).","journal-title":"Am. J. Epidemiol."},{"key":"596_CR63","doi-asserted-by":"publisher","first-page":"1731","DOI":"10.1093\/ije\/dyv135","volume":"44","author":"D Westreich","year":"2015","unstructured":"Westreich, D. et al. Imputation approaches for potential outcomes in causal inference. Int. J. Epidemiol. 44, 1731\u20131737 (2015).","journal-title":"Int. J. Epidemiol."},{"key":"596_CR64","doi-asserted-by":"publisher","first-page":"e185","DOI":"10.2196\/jmir.9134","volume":"20","author":"RA Verheij","year":"2018","unstructured":"Verheij, R. A., Curcin, V., Delaney, B. C. & McGilchrist, M. M. Possible sources of bias in primary care electronic health record data use and reuse. J. Med. Internet Res. 20, e185 (2018).","journal-title":"J. Med. Internet Res."},{"key":"596_CR65","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-021-94516-7","volume":"11","author":"MV Kiang","year":"2021","unstructured":"Kiang, M. V. et al. Sociodemographic characteristics of missing data in digital phenotyping. Sci. Rep. 11, 15408 (2021).","journal-title":"Sci. Rep."},{"key":"596_CR66","doi-asserted-by":"crossref","unstructured":"Tsiampalis, T. & Panagiotakos, D. B. Missing-data analysis: socio-demographic, clinical and lifestyle determinants of low response rate on self-reported psychological and nutrition related multi-item instruments in the context of the ATTICA epidemiological study. BMC Med. Res. Methodol. 20, 148 (2020).","DOI":"10.1186\/s12874-020-01038-3"},{"key":"596_CR67","doi-asserted-by":"publisher","first-page":"n304","DOI":"10.1136\/bmj.n304","volume":"372","author":"D Leslie","year":"2021","unstructured":"Leslie, D., Mazumder, A., Peppin, A., Wolters, M. K. & Hagerty, A. Does \u201cAI\" stand for augmenting inequality in the era of covid-19 healthcare? BMJ 372, n304 (2021).","journal-title":"BMJ"},{"key":"596_CR68","doi-asserted-by":"publisher","first-page":"243","DOI":"10.1038\/s41591-021-01672-4","volume":"28","author":"S Fatumo","year":"2022","unstructured":"Fatumo, S. et al. A roadmap to increase diversity in genomic studies. Nat. Med. 28, 243\u2013250 (2022).","journal-title":"Nat. Med."},{"key":"596_CR69","doi-asserted-by":"crossref","unstructured":"Abdill, R. J., Adamowicz, E. M. & Blekhman, R. Public human microbiome data are dominated by highly developed countries. PLoS Biol. 20, e3001536 (2022).","DOI":"10.1371\/journal.pbio.3001536"},{"key":"596_CR70","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1145\/3458723","volume":"64","author":"T Gebru","year":"2021","unstructured":"Gebru, T. et al. Datasheets for datasets. Commun. ACM 64, 86\u201392 (2021).","journal-title":"Commun. ACM"},{"key":"596_CR71","doi-asserted-by":"crossref","unstructured":"Rostamzadeh, N. et al. Healthsheet: development of a transparency artifact for health datasets. Preprint at https:\/\/arxiv.org\/abs\/2202.13028 (2022).","DOI":"10.1145\/3531146.3533239"},{"key":"596_CR72","doi-asserted-by":"publisher","first-page":"e007450","DOI":"10.1136\/bmjopen-2014-007450","volume":"5","author":"NJ Tierney","year":"2015","unstructured":"Tierney, N. J., Harden, F. A., Harden, M. J. & Mengersen, K. L. Using decision trees to understand structure in missing data. BMJ Open 5, e007450 (2015).","journal-title":"BMJ Open"},{"key":"596_CR73","doi-asserted-by":"crossref","unstructured":"Mart\u00ednez-Plumed, F., Ferri, C., Nieves, D. & Hern\u00e1ndez-Orallo, J. Missing the missing values: the ugly duckling of fairness in machine learning. Int. J. Intell. Syst. 36, 3217\u20133258 (2021).","DOI":"10.1002\/int.22415"},{"key":"596_CR74","doi-asserted-by":"publisher","first-page":"584","DOI":"10.1038\/s41588-019-0379-x","volume":"51","author":"AR Martin","year":"2019","unstructured":"Martin, A. R. et al. Clinical use of current polygenic risk scores may exacerbate health disparities. Nat. Genet. 51, 584\u2013591 (2019).","journal-title":"Nat. Genet."},{"key":"596_CR75","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3502287","volume":"54","author":"A Bansal","year":"2022","unstructured":"Bansal, A., Sharma, R. & Kathuria, M. A systematic review on data scarcity problem in deep learning: solution and applications. ACM Comput. Surv. 54, 1\u201329 (2022).","journal-title":"ACM Comput. Surv."},{"key":"596_CR76","doi-asserted-by":"publisher","first-page":"20170387","DOI":"10.1098\/rsif.2017.0387","volume":"15","author":"T Ching","year":"2018","unstructured":"Ching, T. et al. Opportunities and obstacles for deep learning in biology and medicine. J. R. Soc. Interf. 15, 20170387 (2018).","journal-title":"J. R. Soc. Interf."},{"key":"596_CR77","doi-asserted-by":"publisher","first-page":"669","DOI":"10.1038\/s42256-022-00516-1","volume":"4","author":"W Liang","year":"2022","unstructured":"Liang, W. et al. Advances, challenges and opportunities in creating data for trustworthy AI. Nat. Mach. Intell. 4, 669\u2013677 (2022).","journal-title":"Nat. Mach. Intell."},{"key":"596_CR78","unstructured":"Koch, B., Denton, E., Hanna, A. & Foster, J. G. Reduced, reused and recycled: the life of a dataset in machine learning research. Preprint at https:\/\/arxiv.org\/abs\/2112.01716 (2021)."},{"key":"596_CR79","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.ygeno.2015.11.003","volume":"107","author":"JM Heather","year":"2016","unstructured":"Heather, J. M. & Chain, B. The sequence of sequencers: the history of sequencing DNA. Genomics 107, 1\u20138 (2016).","journal-title":"Genomics"},{"key":"596_CR80","doi-asserted-by":"crossref","unstructured":"Li, P. et al. CleanML: a study for evaluating the impact of data cleaning on ml classification tasks. In 2021 IEEE 37th International Conference on Data Engineering 13\u201324 (IEEE, 2021).","DOI":"10.1109\/ICDE51399.2021.00009"},{"key":"596_CR81","doi-asserted-by":"publisher","first-page":"948","DOI":"10.14778\/2994509.2994514","volume":"9","author":"S Krishnan","year":"2016","unstructured":"Krishnan, S., Wang, J., Wu, E., Franklin, M. J. & Goldberg, K. ActiveClean: interactive data cleaning for statistical modeling. Proc. VLDB Endow. 9, 948\u2013959 (2016).","journal-title":"Proc. VLDB Endow."},{"key":"596_CR82","unstructured":"Zhang, L., Yang, M. & Feng, X. Sparse representation or collaborative representation: which helps face recognition? In IEEE International Conference on Computer Vision 471\u2013478 (IEEE, 2011)."},{"key":"596_CR83","doi-asserted-by":"crossref","unstructured":"Chakraborti, T., McCane, B., Mills, S. & Pal, U. A generalised formulation for collaborative representation of image patches (GP-CRC). In Proc. British Machine Vision Conference (2017).","DOI":"10.5244\/C.31.65"},{"key":"596_CR84","doi-asserted-by":"crossref","unstructured":"Ben Schafer, J., Frankowski, D., Herlocker, J. & Sen, S. Collaborative filtering recommender systems. In Lecture Notes in Computer Science: The Adaptive Web. Springer, Berlin, Heidelberg. 291\u2013324 (2007).","DOI":"10.1007\/978-3-540-72079-9_9"},{"key":"596_CR85","doi-asserted-by":"crossref","unstructured":"Chakraborti, T., McCane, B., Mills, S. & Pal, U. Collaborative representation based fine-grained species recognition. In Proc. IEEE International Conference on Image and Vision Computing New Zealand, 1-6 (IEEE, 2016).","DOI":"10.1109\/IVCNZ.2016.7804421"},{"key":"596_CR86","doi-asserted-by":"publisher","first-page":"1273","DOI":"10.1126\/science.287.5456.1273","volume":"287","author":"WE Vinje","year":"2000","unstructured":"Vinje, W. E. & Gallant, J. L. Sparse coding and decorrelation in primary visual cortex during natural vision. Science 287, 1273\u20131276 (2000).","journal-title":"Science"},{"key":"596_CR87","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1146\/annurev-statistics-040720-031848","volume":"8","author":"TE Raghunathan","year":"2021","unstructured":"Raghunathan, T. E. Synthetic data. Annu. Rev. Stat. Appl. 8, 129\u2013140 (2021).","journal-title":"Annu. Rev. Stat. Appl."},{"key":"596_CR88","unstructured":"Jordon, J. et al. Synthetic data\u2014what, why and how? Preprint at https:\/\/arxiv.org\/abs\/2205.03257 (2022)."},{"key":"596_CR89","unstructured":"Vaswani, A. et al. Attention is all you need. In Advances in Neural Information Processing Systems 30 (2017)."},{"key":"596_CR90","unstructured":"Zhang, H., Goodfellow, I., Metaxas, D. & Odena, A. Self-attention generative adversarial networks. In International conference on machine learning. 7354\u20137363 (PMLR, 2019)"},{"key":"596_CR91","unstructured":"Yoon, J., Jordon, J. & Schaar, M. GAIN: missing data imputation using generative adversarial nets. In International Conference on Machine Learning 80, 5689\u20135698 (PMLR, 2018)."},{"key":"596_CR92","doi-asserted-by":"publisher","unstructured":"Birnbaum, B. et al. Model-assisted cohort selection with bias analysis for generating large-scale cohorts from the EHR for oncology research. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2001.09765 (2020).","DOI":"10.48550\/arXiv.2001.09765"},{"key":"596_CR93","doi-asserted-by":"publisher","first-page":"111485","DOI":"10.1016\/j.rse.2019.111485","volume":"236","author":"E Alerskans","year":"2020","unstructured":"Alerskans, E. et al. Construction of a climate data record of sea surface temperature from passive microwave measurements. Remote Sens. Environ. 236, 111485 (2020).","journal-title":"Remote Sens. Environ."},{"key":"596_CR94","doi-asserted-by":"crossref","unstructured":"Katiraie-Boroujerdy, P. S., Nasrollahi, N., Hsu, K. L. & Sorooshian, S. Evaluation of satellite-based precipitation estimation over Iran. J. Arid Environ. 97, 205\u2013219 (2013).","DOI":"10.1016\/j.jaridenv.2013.05.013"},{"key":"596_CR95","doi-asserted-by":"publisher","first-page":"5124","DOI":"10.1038\/s41467-021-25257-4","volume":"12","author":"TR Andersson","year":"2021","unstructured":"Andersson, T. R. et al. Seasonal arctic sea ice forecasting with probabilistic deep learning. Nat. Commun. 12, 5124 (2021).","journal-title":"Nat. Commun."},{"key":"596_CR96","unstructured":"Groves, R. M. et al. Survey Methodology (John Wiley & Sons, 2011)."},{"key":"596_CR97","doi-asserted-by":"publisher","first-page":"328","DOI":"10.1038\/d41586-020-01747-1","volume":"582","author":"H Ledford","year":"2020","unstructured":"Ledford, H. How Facebook, Twitter and other data troves are revolutionizing social science. Nature 582, 328\u2013331 (2020).","journal-title":"Nature"}],"container-title":["Nature Machine Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s42256-022-00596-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-022-00596-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-022-00596-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,13]],"date-time":"2024-10-13T01:38:33Z","timestamp":1728783513000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s42256-022-00596-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,25]]},"references-count":97,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2023,1]]}},"alternative-id":["596"],"URL":"https:\/\/doi.org\/10.1038\/s42256-022-00596-z","relation":{},"ISSN":["2522-5839"],"issn-type":[{"value":"2522-5839","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,1,25]]},"assertion":[{"value":"27 January 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 November 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}