{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T16:09:05Z","timestamp":1776442145862,"version":"3.51.2"},"reference-count":50,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"DirtyData","award":["ANR-17-CE23-0018-01"],"award-info":[{"award-number":["ANR-17-CE23-0018-01"]}]},{"name":"FUI Wendelin"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Knowl. Data Eng."],"published-print":{"date-parts":[[2022,3,1]]},"DOI":"10.1109\/tkde.2020.2992529","type":"journal-article","created":{"date-parts":[[2020,5,4]],"date-time":"2020-05-04T19:42:48Z","timestamp":1588621368000},"page":"1164-1176","source":"Crossref","is-referenced-by-count":83,"title":["Encoding High-Cardinality String Categorical Variables"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5184-6362","authenticated-orcid":false,"given":"Patricio","family":"Cerda","sequence":"first","affiliation":[]},{"given":"Gael","family":"Varoquaux","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.4324\/9780203774441"},{"key":"ref2","volume-title":"Data Preparation for Data Mining","author":"Pyle","year":"1999"},{"issue":"4","key":"ref3","first-page":"3","article-title":"Data cleaning: Problems and current approaches","volume":"23","author":"Rahm","year":"2000","journal-title":"IEEE Data Eng. Bull."},{"key":"ref4","article-title":"Overview of record linkage and current research directions","author":"Winkler","year":"2006"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2007.250581"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-31164-2"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-018-5724-2"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.5120\/11638-7118"},{"key":"ref9","first-page":"518","article-title":"Similarity search in high dimensions via hashing","volume-title":"Proc. 25th Int. Conf. Very Large Data Bases","volume":"99","author":"Gionis"},{"key":"ref10","article-title":"UCI machine learning repository","author":"Dheeru","year":"2017"},{"issue":"1","key":"ref11","doi-asserted-by":"crossref","first-page":"81","DOI":"10.1023\/A:1021564703268","article-title":"A taxonomy of dirty data","volume":"7","author":"Kim","year":"2003","journal-title":"Data Mining Knowl. Discov."},{"key":"ref12","first-page":"1","article-title":"A formal definition of data quality problems","volume-title":"Proc. Int. Conf. Inf. Qual.","author":"Oliveira"},{"key":"ref13","first-page":"161","article-title":"The tradeoffs of large scale learning","volume-title":"Proc. 20th Int. Conf. Neural Inf. Process. Syst.","author":"Bottou"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/507533.507538"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553516"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/0306-4573(83)90022-5"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.3115\/1119176.1119204"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3360319"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1080\/01638539809545028"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1090\/conm\/026\/737400"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/S0022-0000(03)00025-4"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref23","first-page":"1","article-title":"Efficient estimation of word representations in vector space","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Mikolov"},{"key":"ref24","first-page":"107","article-title":"A simple but tough-to-beat baseline for sentence embeddings","volume-title":"Proc. 5th Int. Conf. Learn. Representations","volume":"152","author":"Arora"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00051"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/vl\/N19-142"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/SEQUEN.1997.666900"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/1008992.1009016"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33460-3_36"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2699960"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.5244\/C.22.50"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1006\/jcss.1999.1690"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9781139924801"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2013.119"},{"key":"ref35","article-title":"Murmurhash3","author":"Appleby","year":"2014"},{"key":"ref36","article-title":"Algorithms for non-negative matrix factorization","volume-title":"Proc. 13th Int. Conf. Neural Inf. Process. Syst.","author":"Lee"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ASPAA.2011.6082314"},{"key":"ref38","first-page":"1","article-title":"Advances in pre-training distributed word representations","volume-title":"Proc. 11th Int. Conf. Lang. Resour. Eval.","author":"Mikolov"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1137\/090771806"},{"key":"ref40","first-page":"2837","article-title":"Information theoretic measures for clusterings comparison: Variants, properties, normalization and correction for chance","volume":"11","author":"Vinh","year":"2010","journal-title":"J. Mach. Learn. Res."},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"ref42","first-page":"6639","article-title":"CatBoost: Unbiased boosting with categorical features","volume-title":"Proc. 32nd Int. Conf. Neural Inf. Process. Syst.","author":"Prokhorenkova"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1142\/9789813235533_0018"},{"key":"ref44","first-page":"1","article-title":"Statistical comparisons of classifiers over multiple data sets","volume":"7","author":"Dem\u0161ar","year":"2006","journal-title":"J. Mach. Learn. Res."},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.2307\/2279372"},{"key":"ref46","article-title":"Distribution-free multiple comparisons","volume":"18","author":"Nemenyi","year":"1962","journal-title":"Biometrics"},{"key":"ref47","first-page":"1177","article-title":"Random features for large-scale kernel machines","volume-title":"Proc. 20th Int. Conf. Neural Inf. Process. Syst.","author":"Rahimi"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btq134"},{"key":"ref49","first-page":"1","article-title":"Automatic machine learning (AutoML)","volume-title":"Proc. ICML Workshop Resource-Efficient Mach. Learn.","author":"Hutter"},{"key":"ref50","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-030-05318-5","volume-title":"Automated Machine Learning-Methods, Systems, Challenges","author":"Hutter","year":"2019"}],"container-title":["IEEE Transactions on Knowledge and Data Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/69\/9702896\/09086128.pdf?arnumber=9086128","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,9]],"date-time":"2024-01-09T22:18:19Z","timestamp":1704838699000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9086128\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,1]]},"references-count":50,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tkde.2020.2992529","relation":{},"ISSN":["1041-4347","1558-2191","2326-3865"],"issn-type":[{"value":"1041-4347","type":"print"},{"value":"1558-2191","type":"electronic"},{"value":"2326-3865","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,3,1]]}}}