{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T18:30:39Z","timestamp":1775586639005,"version":"3.50.1"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"8-10","license":[{"start":{"date-parts":[[2018,6,12]],"date-time":"2018-06-12T00:00:00Z","timestamp":1528761600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001665","name":"Agence Nationale de la Recherche","doi-asserted-by":"publisher","award":["ANR-17-CE23-0018"],"award-info":[{"award-number":["ANR-17-CE23-0018"]}],"id":[{"id":"10.13039\/501100001665","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2018,9]]},"DOI":"10.1007\/s10994-018-5724-2","type":"journal-article","created":{"date-parts":[[2018,6,12]],"date-time":"2018-06-12T16:43:57Z","timestamp":1528821837000},"page":"1477-1494","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":214,"title":["Similarity encoding for learning with dirty categorical variables"],"prefix":"10.1007","volume":"107","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5184-6362","authenticated-orcid":false,"given":"Patricio","family":"Cerda","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1076-5122","authenticated-orcid":false,"given":"Ga\u00ebl","family":"Varoquaux","sequence":"additional","affiliation":[]},{"given":"Bal\u00e1zs","family":"K\u00e9gl","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,6,12]]},"reference":[{"issue":"2","key":"5724_CR1","doi-asserted-by":"publisher","first-page":"202","DOI":"10.5296\/ije.v4i2.1962","volume":"4","author":"H Alkharusi","year":"2012","unstructured":"Alkharusi, H. (2012). Categorical variables in regression analysis: A comparison of dummy and effect coding. International Journal of Education, 4(2), 202\u2013210.","journal-title":"International Journal of Education"},{"issue":"4","key":"5724_CR2","doi-asserted-by":"publisher","first-page":"255","DOI":"10.1016\/0306-4573(83)90022-5","volume":"19","author":"RC Angell","year":"1983","unstructured":"Angell, R. C., Freund, G. E., & Willett, P. (1983). Automatic spelling correction using a trigram similarity measure. Information Processing & Management, 19(4), 255\u2013261.","journal-title":"Information Processing & Management"},{"issue":"3","key":"5724_CR3","doi-asserted-by":"publisher","first-page":"919","DOI":"10.2466\/pms.1998.87.3.919","volume":"87","author":"KJ Berry","year":"1998","unstructured":"Berry, K. J., Mielke, P. W, Jr., & Iyer, H. K. (1998). Factorial designs and dummy coding. Perceptual and Motor Skills, 87(3), 919\u2013927.","journal-title":"Perceptual and Motor Skills"},{"key":"5724_CR4","unstructured":"Bojanowski, P., Grave, E., Joulin, A., & Mikolov, T. (2016). Enriching word vectors with subword information. arXiv preprint arXiv:1607.04606"},{"issue":"9","key":"5724_CR5","doi-asserted-by":"publisher","first-page":"1537","DOI":"10.1109\/TKDE.2011.127","volume":"24","author":"P Christen","year":"2012","unstructured":"Christen, P. (2012). A survey of indexing techniques for scalable record linkage and deduplication. IEEE Transactions on Knowledge and Data Engineering, 24(9), 1537\u20131555.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"5724_CR6","doi-asserted-by":"crossref","unstructured":"Cohen, J., Cohen, P., West, S. G., & Aiken, L. S. (2013). Applied multiple regression\/correlation analysis for the behavioral sciences. London: Routledge.","DOI":"10.4324\/9780203774441"},{"key":"5724_CR7","first-page":"73","volume":"3","author":"W Cohen","year":"2003","unstructured":"Cohen, W., Ravikumar, P., & Fienberg, S. (2003). A comparison of string metrics for matching names and records. Kdd Workshop on Data Cleaning and Object Consolidation, 3, 73\u201378.","journal-title":"Kdd Workshop on Data Cleaning and Object Consolidation"},{"key":"5724_CR8","doi-asserted-by":"crossref","unstructured":"Cohen, W. W. (1998). Integration of heterogeneous databases without common domains using queries based on textual similarity. In ACM SIGMOD record (Vol.\u00a027, pp. 201\u2013212). ACM.","DOI":"10.1145\/276304.276323"},{"issue":"2","key":"5724_CR9","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1023\/A:1009869804967","volume":"3","author":"D Coppersmith","year":"1999","unstructured":"Coppersmith, D., Hong, S. J., & Hosking, J. R. (1999). Partitioning nominal attributes in decision trees. Data Mining and Knowledge Discovery, 3(2), 197\u2013217.","journal-title":"Data Mining and Knowledge Discovery"},{"issue":"1","key":"5724_CR10","doi-asserted-by":"crossref","first-page":"61","DOI":"10.6339\/JDS.2010.08(1).563","volume":"8","author":"MJ Davis","year":"2010","unstructured":"Davis, M. J. (2010). Contrast coding in multiple regression analysis: Strengths, weaknesses, and utility of popular coding structures. Journal of Data Science, 8(1), 61\u201373.","journal-title":"Journal of Data Science"},{"key":"5724_CR11","unstructured":"Duch, W., Grudzinski, K., & Stawski, G. (2000). Symbolic features in neural networks. In Proceedings of the 5th conference on neural networks and their applications. Citeseer."},{"issue":"1","key":"5724_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TKDE.2007.250581","volume":"19","author":"AK Elmagarmid","year":"2007","unstructured":"Elmagarmid, A. K., Ipeirotis, P. G., & Verykios, V. S. (2007). Duplicate record detection: A survey. IEEE Transactions on Knowledge and Data Engineering, 19(1), 1\u201316.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"issue":"328","key":"5724_CR13","doi-asserted-by":"publisher","first-page":"1183","DOI":"10.1080\/01621459.1969.10501049","volume":"64","author":"IP Fellegi","year":"1969","unstructured":"Fellegi, I. P., & Sunter, A. B. (1969). A theory for record linkage. Journal of the American Statistical Association, 64(328), 1183\u20131210.","journal-title":"Journal of the American Statistical Association"},{"issue":"13","key":"5724_CR14","doi-asserted-by":"publisher","first-page":"13","DOI":"10.5120\/11638-7118","volume":"68","author":"WH Gomaa","year":"2013","unstructured":"Gomaa, W. H., & Fahmy, A. A. (2013). A survey of text similarity approaches. International Journal of Computer Applications, 68(13), 13\u201318.","journal-title":"International Journal of Computer Applications"},{"key":"5724_CR15","doi-asserted-by":"crossref","unstructured":"Grabczewski, K., & Jankowski, N. (2003). Transformations of symbolic data for continuous data oriented models. In Artificial neural networks and neural information processing (pp. 359\u2013366). Springer.","DOI":"10.1007\/3-540-44989-2_43"},{"key":"5724_CR16","unstructured":"Guo, C., & Berkhahn, F. (2016). Entity embeddings of categorical variables. arXiv preprint arXiv:1604.06737"},{"issue":"1","key":"5724_CR17","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1002\/(SICI)1097-4571(199601)47:1<70::AID-ASI7>3.0.CO;2-#","volume":"47","author":"DA Hull","year":"1996","unstructured":"Hull, D. A., et al. (1996). Stemming algorithms: A case study for detailed evaluation. JASIS, 47(1), 70\u201384.","journal-title":"JASIS"},{"issue":"406","key":"5724_CR18","doi-asserted-by":"publisher","first-page":"414","DOI":"10.1080\/01621459.1989.10478785","volume":"84","author":"MA Jaro","year":"1989","unstructured":"Jaro, M. A. (1989). Advances in record-linkage methodology as applied to matching the 1985 census of Tampa, Florida. Journal of the American Statistical Association, 84(406), 414\u2013420.","journal-title":"Journal of the American Statistical Association"},{"issue":"1","key":"5724_CR19","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1023\/A:1021564703268","volume":"7","author":"W Kim","year":"2003","unstructured":"Kim, W., Choi, B. J., Hong, E. K., Kim, S. K., & Lee, D. (2003). A taxonomy of dirty data. Data Mining and Knowledge Discovery, 7(1), 81\u201399.","journal-title":"Data Mining and Knowledge Discovery"},{"key":"5724_CR20","doi-asserted-by":"crossref","unstructured":"Kondrak, G. (2005). N-gram similarity and distance. In International symposium on string processing and information retrieval (pp. 115\u2013126). Springer.","DOI":"10.1007\/11575832_13"},{"key":"5724_CR21","unstructured":"Krishnan, S., Franklin, M. J., Goldberg, K., & Wu, E. (2017). Boostclean: Automated error detection and repair for machine learning. arXiv preprint arXiv:1711.01299 ."},{"issue":"12","key":"5724_CR22","doi-asserted-by":"publisher","first-page":"948","DOI":"10.14778\/2994509.2994514","volume":"9","author":"S Krishnan","year":"2016","unstructured":"Krishnan, S., Wang, J., Wu, E., Franklin, M. J., & Goldberg, K. (2016). Activeclean: Interactive data cleaning for statistical modeling. Proceedings of the VLDB Endowment, 9(12), 948\u2013959.","journal-title":"Proceedings of the VLDB Endowment"},{"issue":"2\u20133","key":"5724_CR23","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1080\/01638539809545028","volume":"25","author":"TK Landauer","year":"1998","unstructured":"Landauer, T. K., Foltz, P. W., & Laham, D. (1998). An introduction to latent semantic analysis. Discourse Processes, 25(2\u20133), 259\u2013284.","journal-title":"Discourse Processes"},{"key":"5724_CR24","first-page":"707","volume":"10","author":"VI Levenshtein","year":"1966","unstructured":"Levenshtein, V. I. (1966). Binary codes capable of correcting deletions, insertions, and reversals. Soviet Physics Doklady, 10, 707\u2013710.","journal-title":"Soviet Physics Doklady"},{"issue":"1\u20132","key":"5724_CR25","first-page":"22","volume":"11","author":"JB Lovins","year":"1968","unstructured":"Lovins, J. B. (1968). Development of a stemming algorithm. Mechanical Translation and Computational Linguistics, 11(1\u20132), 22\u201331.","journal-title":"Mechanical Translation and Computational Linguistics"},{"key":"5724_CR26","volume-title":"The theory of relational databases","author":"D Maier","year":"1983","unstructured":"Maier, D. (1983). The theory of relational databases (Vol. 11). Rockville: Computer Science Press."},{"issue":"1","key":"5724_CR27","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1145\/507533.507538","volume":"3","author":"D Micci-Barreca","year":"2001","unstructured":"Micci-Barreca, D. (2001). A preprocessing scheme for high-cardinality categorical attributes in classification and prediction problems. ACM SIGKDD Explorations Newsletter, 3(1), 27\u201332.","journal-title":"ACM SIGKDD Explorations Newsletter"},{"key":"5724_CR28","unstructured":"Mikolov, T., Chen, K., Corrado, G., & Dean, J. (2013). Efficient estimation of word representations in vector space. In ICLR workshop papers."},{"key":"5724_CR29","volume-title":"Research design and statistical analysis","author":"JL Myers","year":"2010","unstructured":"Myers, J. L., Well, A., & Lorch, R. F. (2010). Research design and statistical analysis. London: Routledge."},{"issue":"2","key":"5724_CR30","doi-asserted-by":"publisher","first-page":"243","DOI":"10.1207\/s15327906mbr2302_7","volume":"23","author":"KE O\u2019Grady","year":"1988","unstructured":"O\u2019Grady, K. E., & Medoff, D. R. (1988). Categorical variables in multiple regression: Some cautions. Multivariate Behavioral Research, 23(2), 243\u20132060.","journal-title":"Multivariate Behavioral Research"},{"key":"5724_CR31","unstructured":"Oliveira, P., Rodrigues, F., & Henriques, P. R. (2005). A formal definition of data quality problems. In Proceedings of the 2005 international conference on information quality (MIT IQ conference)."},{"key":"5724_CR32","volume-title":"Multiple regression in behavioral research","author":"EJ Pedhazur","year":"1973","unstructured":"Pedhazur, E. J., Kerlinger, F. N., et al. (1973). Multiple regression in behavioral research. Rinehart and Winston New York: Holt."},{"key":"5724_CR33","first-page":"2825","volume":"12","author":"F Pedregosa","year":"2011","unstructured":"Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., Grisel, O., et al. (2011). Scikit-learn: Machine learning in Python. Journal of Machine Learning Research, 12, 2825\u20132830.","journal-title":"Journal of Machine Learning Research"},{"key":"5724_CR34","volume-title":"Data preparation for data mining","author":"D Pyle","year":"1999","unstructured":"Pyle, D. (1999). Data preparation for data mining (Vol. 1). Burlington: Morgan Kaufmann."},{"key":"5724_CR35","unstructured":"Rahimi, A., & Recht, B. (2008). Random features for large-scale kernel machines. In J. C. Platt, D. Koller, Y. Singer, & S. T. Roweis (Eds.), Advances in neural information processing systems 20 (pp. 1177\u20131184). Curran Associates, Inc. http:\/\/papers.nips.cc\/paper\/3182-random-features-for-large-scale-kernel-machines.pdf ."},{"issue":"4","key":"5724_CR36","first-page":"3","volume":"23","author":"E Rahm","year":"2000","unstructured":"Rahm, E., & Do, H. H. (2000). Data cleaning: Problems and current approaches. IEEE Data Engineering Bulletin, 23(4), 3\u201313.","journal-title":"IEEE Data Engineering Bulletin"},{"key":"5724_CR37","doi-asserted-by":"crossref","unstructured":"Sarawagi, S., & Bhamidipaty, A. (2002). Interactive deduplication using active learning. In Proceedings of the eighth ACM SIGKDD international conference on knowledge discovery and data mining (pp. 269\u2013278). ACM.","DOI":"10.1145\/775047.775087"},{"key":"5724_CR38","unstructured":"Shyu, M. L., Sarinnapakorn, K., Kuruppu-Appuhamilage, I., Chen, S. C., Chang, L., & Goldring, T. (2005). Handling nominal features in anomaly intrusion detection problems. In 15th international workshop on research issues in data engineering: Stream data mining and applications (pp. 55\u201362). IEEE."},{"key":"5724_CR39","doi-asserted-by":"crossref","unstructured":"Weinberger, K., Dasgupta, A., Langford, J., Smola, A., & Attenberg, J. (2009). Feature hashing for large scale multitask learning. In Proceedings of the 26th annual international conference on machine learning (pp. 1113\u20131120). ACM.","DOI":"10.1145\/1553374.1553516"},{"key":"5724_CR40","volume-title":"The state of record linkage and current research problems","author":"WE Winkler","year":"1999","unstructured":"Winkler, W. E. (1999). The state of record linkage and current research problems. Citeseer: Statistical Research Division, US Census Bureau."},{"key":"5724_CR41","unstructured":"Winkler, W. E. (2002). Methods for record linkage and bayesian networks. Technical report, Statistical Research Division, US Census Bureau, Washington, DC."}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-018-5724-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-018-5724-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-018-5724-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,25]],"date-time":"2022-08-25T15:33:47Z","timestamp":1661441627000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-018-5724-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,6,12]]},"references-count":41,"journal-issue":{"issue":"8-10","published-print":{"date-parts":[[2018,9]]}},"alternative-id":["5724"],"URL":"https:\/\/doi.org\/10.1007\/s10994-018-5724-2","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,6,12]]},"assertion":[{"value":"10 December 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 June 2018","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 June 2018","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}