{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T09:56:05Z","timestamp":1773482165801,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2009,10,28]],"date-time":"2009-10-28T00:00:00Z","timestamp":1256688000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Data Min Knowl Disc"],"published-print":{"date-parts":[[2010,1]]},"DOI":"10.1007\/s10618-009-0155-0","type":"journal-article","created":{"date-parts":[[2009,10,27]],"date-time":"2009-10-27T11:23:53Z","timestamp":1256642633000},"page":"152-187","source":"Crossref","is-referenced-by-count":26,"title":["An incremental clustering scheme for data de-duplication"],"prefix":"10.1007","volume":"20","author":[{"given":"Gianni","family":"Costa","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Giuseppe","family":"Manco","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Riccardo","family":"Ortale","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2009,10,28]]},"reference":[{"key":"155_CR1","doi-asserted-by":"crossref","unstructured":"Agichtein E, Ganti V (2004) Mining reference tables for automatic text segmentation. In: Proceedings of the ACM SIGKDD international conference on knowledge discovery and data mining, pp 20\u201329","DOI":"10.1145\/1014052.1014058"},{"key":"155_CR2","doi-asserted-by":"crossref","unstructured":"Ananthakrishna R, Chaudhuri S, Ganti V (2002) Eliminating fuzzy duplicates in data warehouses. In: Proceedings of the international conference on very large databases, pp 586\u2013597","DOI":"10.1016\/B978-155860869-6\/50058-5"},{"key":"155_CR3","unstructured":"Arasu A, Ganti V, Kaushik R (2006) Efficient exact set-similarity joins. In: Proceedings of the international conference on very large databases, pp 918\u2013929"},{"key":"155_CR4","doi-asserted-by":"crossref","unstructured":"Bawa M, Tyson S, Condie, Ganesan P (2005) LSH forest: self-tuning indexes for similarity search. In: Proceedings of the international conference on world wide web, pp 651\u2013660","DOI":"10.1145\/1060745.1060840"},{"key":"155_CR5","doi-asserted-by":"crossref","unstructured":"Bayardo RJ, Ma Y, Srikant R (2007) Scaling up all pairs similarity search. In: Proceedings of the international conference on world wide web, pp 131\u2013140","DOI":"10.1145\/1242572.1242591"},{"key":"155_CR6","doi-asserted-by":"crossref","unstructured":"Bhattacharya I, Getoor L (2004) Iterative record linkage for cleaning and integration. In: Proceedings of the SIGMOD workshop on research issues on data mining and knowledge discovery, pp 11\u201318","DOI":"10.1145\/1008694.1008697"},{"key":"155_CR7","doi-asserted-by":"crossref","unstructured":"Bilenko M, Mooney RJ (2003a) Adaptive duplicate detection using learnable string similarity measures. In: Proceedings of the ACM SIGKDD international conference on knowledge discovery and data mining, pp 39\u201348","DOI":"10.1145\/956750.956759"},{"key":"155_CR8","unstructured":"Bilenko M, Mooney RJ (2003b) On evaluation and training-set construction for duplicate detection. In: Proceedings of the KDD workshop on data cleaning, record linkage, and object consolidation, pp 7\u201312"},{"key":"155_CR9","doi-asserted-by":"crossref","unstructured":"Broder A, Glassman S, Manasse M, Zweig G (1997) Syntactic clustering on the Web. In: Proceedings of the international conference on World Wide Web, pp 1157\u20131166","DOI":"10.1016\/S0169-7552(97)00031-7"},{"key":"155_CR10","unstructured":"Broder A, Charikar M, Frieze AM, Mitzenmacher M (1998) Minwise independent permutations. In: Proceedings of the ACM symposium on theory of computing, pp 327\u2013336"},{"key":"155_CR11","doi-asserted-by":"crossref","unstructured":"Cesario E, Folino F, Manco G, Pontieri L (2005) An incremental clustering scheme for duplicate detection in large databases. In: Proceedings of the international conference databases and applications symposium, pp 89\u201395","DOI":"10.1109\/IDEAS.2005.10"},{"issue":"3","key":"155_CR12","doi-asserted-by":"crossref","first-page":"285","DOI":"10.1007\/s10115-007-0085-3","volume":"15","author":"E Cesario","year":"2008","unstructured":"Cesario E, Folino F, Locane A, Manco G, Ortale R (2008) Boosting text segmentation via progressive classification. J Knowl Inf Syst 15(3): 285\u2013320","journal-title":"J Knowl Inf Syst"},{"key":"155_CR13","doi-asserted-by":"crossref","unstructured":"Chaudhuri S, Ganjam K, Ganti V, Motwani R (2003) Robust and efficient fuzzy match for online data cleaning. In: Proceedings of the ACM SIGMOD conference on management of data, pp 313\u2013324","DOI":"10.1145\/872757.872796"},{"key":"155_CR14","doi-asserted-by":"crossref","unstructured":"Chaudhuri S, Ganti V, Motwani R (2005) Robust identification of fuzzy duplicates. In: Proceedings of the international conference on data engineering, pp 865\u2013876","DOI":"10.1109\/ICDE.2005.125"},{"issue":"3","key":"155_CR15","doi-asserted-by":"crossref","first-page":"273","DOI":"10.1145\/502807.502808","volume":"33","author":"E Chavez","year":"2001","unstructured":"Chavez E, Navarro G, Baeza-Yates R, Luis Marroquin J (2001) Searching in metric spaces. ACM Comput Surv 33(3): 273\u2013321","journal-title":"ACM Comput Surv"},{"key":"155_CR16","unstructured":"Ciaccia P, Patella M, Zezula P (1997) M-Tree: an efficient access method for similarity search in metric spaces. In: Proceedings of the international conference on very large databases, pp 426\u2013435"},{"key":"155_CR17","unstructured":"Cochinwala M, Dalal S, Elmagarmid AK, Verykios VS (2005) Record matching: past, present and future"},{"key":"155_CR18","unstructured":"Cohen W, Richman J (2001) Learning to match and cluster entity names. In: Proceedings of the ACM SIGIR workshop on mathematical\/formal methods in information retrieval, pp 13\u201318"},{"key":"155_CR19","doi-asserted-by":"crossref","unstructured":"Cohen WW, Richman J (2002) Learning to match and cluster large high-dimensional data sets for data integration. In: Proceedings of the ACM SIGKDD international conference on knowledge discovery and data mining, pp 475\u2013480","DOI":"10.1145\/775047.775116"},{"key":"155_CR20","unstructured":"Cohen WW, Ravikumar P, Fienberg SE (2003) A comparison of string distance metrics for name-matching tasks. In: Proceedings of the IJCAI workshop on information integration on the web, pp 73\u201378"},{"key":"155_CR21","unstructured":"Ester M, Kriegel HP, Sander J, Xu X (1996) A density-based algorithm for discovering clusters in large spatial databases with noise. In: Proceedings of the international conference on knowledge discovery and data mining, pp 226\u2013231"},{"key":"155_CR22","doi-asserted-by":"crossref","first-page":"1183","DOI":"10.1080\/01621459.1969.10501049","volume":"64","author":"IP Fellegi","year":"1969","unstructured":"Fellegi IP, Sunter AB (1969) A theory for record linkage. J Am Stat Assoc 64: 1183\u20131210","journal-title":"J Am Stat Assoc"},{"key":"155_CR23","doi-asserted-by":"crossref","unstructured":"Ganti V et al (1999) Clustering large datasets in arbitrary metric spaces. In: Proceedings of the international conference on data engineering, pp 502\u2013511","DOI":"10.1109\/ICDE.1999.754966"},{"key":"155_CR24","unstructured":"Gionis A, Indyk P, Motwani R (1999) Similarity search in high dimensions via hashing. In: Proceedings of the international conference on very large databases, pp 518\u2013529"},{"key":"155_CR25","unstructured":"Gravano L, Ipeirotis PG, Jagadish HV, Koudas N, Muthukrishnan S, Srivastava D (2001) Approximate string joins in a database (Almost) for free. In: Proceedings of the international conference on very large databases, pp 491\u2013500"},{"key":"155_CR26","unstructured":"Gu L, Baxter RA, Vickers D, Rainsford C (2003) Record linkage: current practice and future directions. Technical Report, number 03\/83. CSIRO Mathematical and Information Sciences"},{"key":"155_CR27","doi-asserted-by":"crossref","unstructured":"Guha S, Rastogi R, Shim K (1998) CURE: an efficient clustering algorithm for large databases. In: Proceedings of the ACM SIGMOD international conference on management of data, pp 73\u201384","DOI":"10.1145\/276304.276312"},{"issue":"5","key":"155_CR28","doi-asserted-by":"crossref","first-page":"345","DOI":"10.1016\/S0306-4379(00)00022-3","volume":"25","author":"S Guha","year":"2001","unstructured":"Guha S, Rastogi R, Shim K (2001) ROCK: a robust clustering algorithm for categorical attributes. Inf Syst 25(5): 345\u2013366","journal-title":"Inf Syst"},{"key":"155_CR29","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511574931","volume-title":"Algorithms on strings, trees and sequences","author":"D Gunsfield","year":"1997","unstructured":"Gunsfield D (1997) Algorithms on strings, trees and sequences. Cambridge University Press, Cambridge"},{"key":"155_CR30","doi-asserted-by":"crossref","unstructured":"Hern\u00e1ndez MA, Stolfo SJ (1995) The Merge\/Purge problem for large databases. In: Proceedings of the ACM SIGMOD international conference on management of data, pp 127\u2013138","DOI":"10.1145\/223784.223807"},{"issue":"4","key":"155_CR31","doi-asserted-by":"crossref","first-page":"517","DOI":"10.1145\/958942.958948","volume":"28","author":"GR Hjatason","year":"2003","unstructured":"Hjatason GR, Samet H (2003) Index-driven similarity search in metric spaces. ACM Trans Database Syst 28(4): 517\u2013518","journal-title":"ACM Trans Database Syst"},{"key":"155_CR32","unstructured":"Indyk P, Motwani R (1998) Approximate nearest neighbor-towards removing the curse of dimensionality. In: Proceedings of symposium on theory of computing, pp 604\u2013613"},{"issue":"1","key":"155_CR33","first-page":"1","volume":"18","author":"PG Ipeirotis","year":"2007","unstructured":"Ipeirotis PG, Verykios VS, Elmagarmid AK (2007) Duplicate record detection: a review. IEEE Trans Knowl Data Eng 18(1): 1\u201316","journal-title":"IEEE Trans Knowl Data Eng"},{"issue":"3","key":"155_CR34","doi-asserted-by":"crossref","first-page":"264","DOI":"10.1145\/331499.331504","volume":"31","author":"AK Jain","year":"1999","unstructured":"Jain AK, Murty MN, Flynn PJ (1999) Data clustering: a review. ACM Comput Surv 31(3): 264\u2013323","journal-title":"ACM Comput Surv"},{"key":"155_CR35","doi-asserted-by":"crossref","unstructured":"Kalashnikov DV, Mehrotra S, Chen Z (2005) Exploiting relationships for domain independent data cleaning. In: Proceedings of the SIAM conference on data mining, pp 262\u2013273","DOI":"10.1137\/1.9781611972757.24"},{"key":"155_CR36","doi-asserted-by":"crossref","unstructured":"McCallum AK, Nigam K, Ungar L (2000) Efficient clustering of high-dimensional data sets with application to reference matching. In: Proceedings of the ACM SIGKDD international conference on knowledge discovery and data mining, pp 169\u2013178","DOI":"10.1145\/347090.347123"},{"key":"155_CR37","unstructured":"Monge AE, Elkan CP (1996) The field matching problem: algorithms and applications. In: Proceedings of the international conference on knowledge discovery and data mining, pp 267\u2013270"},{"key":"155_CR38","unstructured":"Monge AE, Elkan CP (1997) An efficient domain-independent algorithm for detecting approximately duplicate database records. In: Proceedings of the SIGMOD workshop on research issues on data mining and knowledge discovery, pp 23\u201329"},{"key":"155_CR39","unstructured":"Monge AE, Elkan CP (2001) Automatic segmentation of text into structured records. In: Proceedings of the ACM SIGMOD conference on management of data"},{"key":"155_CR40","unstructured":"Neiling M, Jurk S (2003) The object identification framework. In: Proceedings of the KDD workshop on data cleaning, record linkage, and object consolidation, pp 37\u201339"},{"key":"155_CR41","doi-asserted-by":"crossref","unstructured":"Sarawagi S, Bhamidipaty A (2002) Interactive deduplication using active learning. In: Proceedings of the 8th ACM SIGKDD international conference on knowledge discovery and data mining, pp 269\u2013278","DOI":"10.1145\/775047.775087"},{"key":"155_CR42","unstructured":"Sarawagi S, Kirpal A (2004) Efficient exact set-similarity joins. In: Proceedings of the SIGMOD international conference on management of data, pp 743\u2013754"},{"key":"155_CR43","doi-asserted-by":"crossref","unstructured":"Tejada S, Knoblock CA, Minton S (2002) Learning domain-independent string transformation weights for high accuracy object identification. In: Proceedings of the ACM SIGKDD international conference on knowledge discovery and data mining, pp 350\u2013359","DOI":"10.1145\/775047.775099"},{"issue":"1","key":"155_CR44","doi-asserted-by":"crossref","first-page":"191","DOI":"10.1016\/0304-3975(92)90143-4","volume":"92","author":"E Ukkonen","year":"1982","unstructured":"Ukkonen E (1982) Approximate string matching using q-grams and maximal matches. Theor Comput Sci 92(1): 191\u2013211","journal-title":"Theor Comput Sci"},{"key":"155_CR45","unstructured":"Weber R, Schek HJ, Blott S (1998) A quantitative analsysis and performance study for similarity search in high-dimensional spaces. In: Proceedings of the international conference on very large databases, pp 194\u2013205"},{"key":"155_CR46","unstructured":"Winkler WE (1990) String comparator metrics and enhanced decision rules in the Fellegi-Sunter model of record linkage. In: Proceedings of the section on survey research methods, American Statistical Association, pp 354\u2013359"},{"key":"155_CR47","unstructured":"Winkler WE (1999) The state of record linkage and current research problems. Technical Report. Statistical Research Division, US Census Bureau"}],"container-title":["Data Mining and Knowledge Discovery"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10618-009-0155-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10618-009-0155-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10618-009-0155-0","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,30]],"date-time":"2019-05-30T19:29:41Z","timestamp":1559244581000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10618-009-0155-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2009,10,28]]},"references-count":47,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2010,1]]}},"alternative-id":["155"],"URL":"https:\/\/doi.org\/10.1007\/s10618-009-0155-0","relation":{},"ISSN":["1384-5810","1573-756X"],"issn-type":[{"value":"1384-5810","type":"print"},{"value":"1573-756X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2009,10,28]]}}}