{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T18:16:33Z","timestamp":1757700993448,"version":"3.32.0"},"publisher-location":"Berlin, Heidelberg","reference-count":35,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783540463290"},{"type":"electronic","value":"9783540463306"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2006]]},"DOI":"10.1007\/11890591_5","type":"book-chapter","created":{"date-parts":[[2006,11,20]],"date-time":"2006-11-20T12:42:18Z","timestamp":1164026538000},"page":"136-164","source":"Crossref","is-referenced-by-count":7,"title":["Unsupervised Duplicate Detection Using Sample Non-duplicates"],"prefix":"10.1007","author":[{"given":"Patrick","family":"Lehti","sequence":"first","affiliation":[]},{"given":"Peter","family":"Fankhauser","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"5_CR1","doi-asserted-by":"publisher","first-page":"1183","DOI":"10.2307\/2286061","volume":"64","author":"I.P. Fellegi","year":"1969","unstructured":"Fellegi, I.P., Sunter, A.B.: A theory for record linkage. Journal of the American Statistical Association\u00a064, 1183\u20131210 (1969)","journal-title":"Journal of the American Statistical Association"},{"key":"5_CR2","volume-title":"Artificial Intelligence: A Modern Approach","author":"S. Russell","year":"2002","unstructured":"Russell, S., Norvig, P.: Artificial Intelligence: A Modern Approach. Prentice-Hall, Englewood Cliffs (2002)"},{"key":"5_CR3","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1023\/A:1009761603038","volume":"2","author":"M.A. Hernandez","year":"1998","unstructured":"Hernandez, M.A., Stolfo, S.J.: Real-world data is dirty: Data cleansing and the merge\/purge problem. Data Mining and Knowledge Discovery\u00a02, 9\u201337 (1998)","journal-title":"Data Mining and Knowledge Discovery"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Galhardas, H., Florescu, D., Shasha, D., Simon, E.: An extensible framework for data cleaning. In: Proceddings of the 16th International Conference on Data Engineering ICDE 2000, vol.\u00a0312 (2000)","DOI":"10.1109\/ICDE.2000.839429"},{"key":"5_CR5","unstructured":"Monge, A., Elkan, C.: An efficient domain independent algorithm for detecting approximately duplicate database records. In: Proceedings of the SIGMOD Workshop on Data Mining and Knowledge Discovery (1997)"},{"key":"5_CR6","doi-asserted-by":"publisher","first-page":"954","DOI":"10.1126\/science.130.3381.954","volume":"130","author":"H.B. Newcombe","year":"1959","unstructured":"Newcombe, H.B., Kennedy, J.M., Axford, S.J., James, A.P.: Automatic linkage of vital records. Science\u00a0130, 954\u2013959 (1959)","journal-title":"Science"},{"key":"5_CR7","unstructured":"Winkler, W.E.: Using the em algorithm for weight computation in the fellegi-sunter model of record linkage. In: Proceedings of the Section on Survey Research Methods, American Statistical Association, pp. 667\u2013671 (1988)"},{"key":"5_CR8","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1111\/j.2517-6161.1977.tb01600.x","volume":"34","author":"A.P. Dempster","year":"1977","unstructured":"Dempster, A.P., Laird, N.M., Rubin, D.B.: Maximum likelihood from incomplete data via the em algorithm. Journal of the Royal Statistical Society, Series B\u00a034, 1\u201338 (1977)","journal-title":"Journal of the Royal Statistical Society, Series B"},{"key":"5_CR9","unstructured":"Winkler, W.E.: Improved decision rules in the fellegi-sunter model of record linkage. In: Proceedings of the Section on Survey Research Methods, American Statistical Association, pp. 274\u2013279 (1993)"},{"key":"5_CR10","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1198\/016214501750332956","volume":"79","author":"M.D. Larsen","year":"2001","unstructured":"Larsen, M.D., Rubin, D.B.: Alternative automated record linkage using mixture models. Journal of the American Statistical Association\u00a079, 32\u201341 (2001)","journal-title":"Journal of the American Statistical Association"},{"key":"5_CR11","unstructured":"Ravikumar, P., Cohen, W.W.: A hierarchical graphical model for record linkage. In: AUAI 2004: Proceedings of the 20th conference on Uncertainty in artificial intelligence, pp. 454\u2013461. AUAI Press (2004)"},{"key":"5_CR12","volume-title":"Proceedings of the 18th International Conference on Data Engineering (ICDE 2002)","author":"M.G. Elfeky","year":"2002","unstructured":"Elfeky, M.G., Verykios, V.S., Elmargarid, A.K.: Tailor: A record linkage toolbox. In: Proceedings of the 18th International Conference on Data Engineering (ICDE 2002), Washington, DC, USA, vol.\u00a017. IEEE Computer Society, Los Alamitos (2002)"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Cohen, W.W., Richman, J.: Learning to match and cluster large high-dimensional data sets for data integration. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD 2002), Edmonton, Alberta (2002)","DOI":"10.1145\/775047.775116"},{"key":"5_CR14","unstructured":"Bilenko, M., Mooney, R.J.: Learning to combine trained distance metrics for duplicate detection in databases. Technical Report AI 02-296, Artificial Intelligence Laboratory. University of Texas at Austin, Austin, TX (2002)"},{"key":"5_CR15","doi-asserted-by":"publisher","first-page":"635","DOI":"10.1016\/S0306-4379(01)00043-6","volume":"26","author":"S. Tejada","year":"2001","unstructured":"Tejada, S., Knoblock, C.A., Minton, S.: Learning object identification rules for information integration. Information Systems Journal\u00a026, 635\u2013656 (2001)","journal-title":"Information Systems Journal"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Sarawagi, S., Bhamidipaty, A.: Interactive deduplication using active learning. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD 2002), Edmonton, Alberta (2002)","DOI":"10.1145\/775047.775087"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Ananthakrishna, R., Chaudhuri, S., Ganti, V.: Eliminating fuzzy duplicates in data warehouses. In: Proceedings of the 28th International Conference on Very Large Data Bases(VLDB 2002) (2002)","DOI":"10.1016\/B978-155860869-6\/50058-5"},{"key":"5_CR18","unstructured":"Parag, D.P.: Multi-relational record linkage. In: Proceedings of the KDD 2004 Workshop on Multi-Relational Data Mining, pp. 31\u201348 (2004)"},{"key":"5_CR19","doi-asserted-by":"crossref","unstructured":"Dong, X., Halevy, A.Y., Madhavan, J.: Reference reconciliation in complex information spaces. In: SIGMOD Conference, pp. 85\u201396 (2005)","DOI":"10.1145\/1066157.1066168"},{"key":"5_CR20","unstructured":"Bhattacharya, I., Getoor, L.: Deduplication and group detection using links. In: Proceedings of the KDD 2004 Workshop on Link Analysis and Group Detection (2004)"},{"key":"5_CR21","volume-title":"Advances in Neural Information Processing Systems 15","author":"H. Pasula","year":"2003","unstructured":"Pasula, H., Marthi, B., Milch, B., Russell, S., Shpitser, I.: Identity uncertainty and citation matching. In: Advances in Neural Information Processing Systems 15. MIT Press, Cambridge (2003)"},{"key":"5_CR22","unstructured":"Ley, M.: DBLP computer science bibliography, http:\/\/dblp.uni-trier.de\/"},{"key":"5_CR23","unstructured":"Fachinformationszentrum-Karlsruhe: CompuScience, http:\/\/www.zblmath.fiz-karlsruhe.de\/cs\/"},{"key":"5_CR24","unstructured":"Cohen, W.W., Ravikumar, P., Fienberg, S.E.: A comparison of string metrics for matching names and records. In: Proceedings of the KDD 2003 Workshop on Data Cleaning, Record Linkage, and Object Consolidation, Washington, DC, pp. 13\u201318 (2003)"},{"key":"5_CR25","unstructured":"Evert, S.: Computational approaches to collocations, http:\/\/www.collocations.de\/"},{"key":"5_CR26","first-page":"22","volume":"16","author":"K.W. Church","year":"1990","unstructured":"Church, K.W., Hanks, P.: Word association norms, mutual information and lexicography. Computational Linguistics\u00a016, 22\u201329 (1990)","journal-title":"Computational Linguistics"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Church, K.W., Gale, W., Hanks, P., Hindle, D.: Using statistics in lexical analysis. Lexical Acquisition: Using On-line Recources to Build a Lexicon, 115\u2013164 (1991)","DOI":"10.4324\/9781315785387-8"},{"key":"5_CR28","doi-asserted-by":"crossref","first-page":"434","DOI":"10.1007\/978-3-662-05744-5","volume-title":"Angewandte Statistik","author":"L. Sachs","year":"2004","unstructured":"Sachs, L.: Angewandte Statistik, pp. 434\u2013435. Springer, Berlin (2004)"},{"key":"5_CR29","unstructured":"MacQueen, J.: Some methods for classification and analysis of multivariate observations. In: Proceedings of the Fifth Berkeley Symposium on Math., Stat. and Prob., pp. 281\u2013296 (1967)"},{"key":"5_CR30","first-page":"888","volume":"13","author":"J. Shi","year":"2000","unstructured":"Shi, J., Malik, J.: Normalized cuts and image segmentation. IEEE Trans. CAD-Integrated Circuits and Systems\u00a013, 888\u2013905 (2000)","journal-title":"IEEE Trans. CAD-Integrated Circuits and Systems"},{"key":"5_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"210","DOI":"10.1007\/11546849_21","volume-title":"Data Warehousing and Knowledge Discovery","author":"P. Lehti","year":"2005","unstructured":"Lehti, P., Fankhauser, P.: A Precise Blocking Method for Record Linkage. In: Tjoa, A.M., Trujillo, J. (eds.) DaWaK 2005. LNCS, vol.\u00a03589, pp. 210\u2013220. Springer, Heidelberg (2005)"},{"key":"5_CR32","first-page":"74","volume-title":"Modern Information Retrieval","author":"R. Baeza-Yates","year":"1999","unstructured":"Baeza-Yates, R., Ribiero-Neto, B.: Modern Information Retrieval, pp. 74\u201379. Addison-Wesley, Reading (1999)"},{"key":"5_CR33","unstructured":"Chang, C.C., Lin, C.J.: Libsvm - a library for support vector machines, http:\/\/www.csie.ntu.edu.tw\/~cjlin\/libsvm\/"},{"key":"5_CR34","unstructured":"Cohen, W.W., Ravikumar, P., Fienberg, S.: Secondstring - an open-source java-based package of approximate string-matching techniques, http:\/\/secondstring.sourceforge.net\/"},{"key":"5_CR35","first-page":"707","volume":"10","author":"V.I. Levenshtein","year":"1966","unstructured":"Levenshtein, V.I.: Binary codes capable of correcting insertions and reversals. Soviet Physics Doklady\u00a010, 707\u2013710 (1966)","journal-title":"Soviet Physics Doklady"}],"container-title":["Lecture Notes in Computer Science","Journal on Data Semantics VII"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/11890591_5.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,12]],"date-time":"2025-01-12T00:27:44Z","timestamp":1736641664000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/11890591_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2006]]},"ISBN":["9783540463290","9783540463306"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/11890591_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2006]]}}}