{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,26]],"date-time":"2025-10-26T14:18:21Z","timestamp":1761488301831},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2009,8,20]],"date-time":"2009-08-20T00:00:00Z","timestamp":1250726400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["The VLDB Journal"],"published-print":{"date-parts":[[2009,10]]},"DOI":"10.1007\/s00778-009-0161-2","type":"journal-article","created":{"date-parts":[[2009,8,19]],"date-time":"2009-08-19T13:09:50Z","timestamp":1250687390000},"page":"1141-1166","source":"Crossref","is-referenced-by-count":38,"title":["Creating probabilistic databases from duplicated data"],"prefix":"10.1007","volume":"18","author":[{"given":"Oktie","family":"Hassanzadeh","sequence":"first","affiliation":[]},{"given":"Ren\u00e9e J.","family":"Miller","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2009,8,20]]},"reference":[{"key":"161_CR1","doi-asserted-by":"crossref","unstructured":"Ailon, N., Charikar, M., Newman, A.: Aggregating inconsistent information: ranking and clustering. In: ACM Symp. on Theory of Computing (STOC), pp. 684\u2013693 (2005)","DOI":"10.1145\/1060590.1060692"},{"key":"161_CR2","doi-asserted-by":"crossref","unstructured":"Andritsos, P., Fuxman, A., Miller, R.J.: Clean answers over dirty databases: a probabilistic approach. In: IEEE Proc. of the Int\u2019l Conf. on Data Eng., p. 30 (2006)","DOI":"10.1109\/ICDE.2006.35"},{"key":"161_CR3","doi-asserted-by":"crossref","unstructured":"Antova, L., Koch, C., Olteanu, D.: Fast and simple relational processing of uncertain data. In: IEEE Proc. of the Int\u2019l Conf. on Data Eng., pp. 983\u2013992 (2008)","DOI":"10.1109\/ICDE.2008.4497507"},{"key":"161_CR4","unstructured":"Arasu, A., Ganti, V., Kaushik, R.: Efficient exact set-similarity joins. In: Proc. of the Int\u2019l Conf. on Very Large Data Bases (VLDB), pp. 918\u2013929 (2006)"},{"key":"161_CR5","doi-asserted-by":"crossref","unstructured":"Arasu, A., R\u00e9, C., Suciu, D.: Large-scale deduplication with constraints using dedupalog. In: IEEE Proc. of the Int\u2019l Conf. on Data Eng., pp. 952\u2013963 (2009)","DOI":"10.1109\/ICDE.2009.43"},{"issue":"1","key":"161_CR6","doi-asserted-by":"crossref","first-page":"95","DOI":"10.7155\/jgaa.00084","volume":"8","author":"J.A. Aslam","year":"2004","unstructured":"Aslam J.A., Pelekhov E., Rus D.: The star clustering algorithm for static and dynamic information organization. J. Graph Algorithm. Appl. 8(1), 95\u2013129 (2004)","journal-title":"J. Graph Algorithm. Appl."},{"issue":"1\u20133","key":"161_CR7","doi-asserted-by":"crossref","first-page":"89","DOI":"10.1023\/B:MACH.0000033116.57574.95","volume":"56","author":"N. Bansal","year":"2004","unstructured":"Bansal N., Blum A., Chawla S.: Correlation clustering. Mach. Learn. 56(1\u20133), 89\u2013113 (2004)","journal-title":"Mach. Learn."},{"key":"161_CR8","doi-asserted-by":"crossref","unstructured":"Bayardo, R.J., Ma, Y., Srikant, R.: Scaling up all pairs similarity search. In: Int\u2019l World Wide Web Conference (WWW), pp. 131\u2013140, Banff (2007)","DOI":"10.1145\/1242572.1242591"},{"issue":"1","key":"161_CR9","doi-asserted-by":"crossref","first-page":"255","DOI":"10.1007\/s00778-008-0098-x","volume":"18","author":"O. Benjelloun","year":"2009","unstructured":"Benjelloun O., Garcia-Molina H., Menestrina D., Su Q., Whang S.E., Widom J.: Swoosh: a generic approach to entity resolution. Int. J. Very Large Data Bases 18(1), 255\u2013276 (2009)","journal-title":"Int. J. Very Large Data Bases"},{"key":"161_CR10","unstructured":"Beskales, G., Soliman, M.A., Ilyas, I.F., Ben-David, S.: Modeling and querying possible repairs in duplicate detection. In: Proc. of the Int\u2019l Conf. on Very Large Data Bases (VLDB), 2009 (To Appear). Available as University of Waterloo, Tech. Report CS-2009-15, 2009"},{"key":"161_CR11","doi-asserted-by":"crossref","DOI":"10.1007\/978-1-4757-0450-1","volume-title":"Pattern Recognition with Fuzzy Objective Function Algorithms","author":"J.C. Bezdek","year":"1981","unstructured":"Bezdek J.C.: Pattern Recognition with Fuzzy Objective Function Algorithms. Kluwer Academic Publishers, Dordrecht (1981)"},{"key":"161_CR12","doi-asserted-by":"crossref","unstructured":"Bhattacharya, I., Getoor, L.: A latent dirichlet model for unsupervised entity resolution. In: Proc. of the SIAM International Conference on Data Mining (SDM), pp. 47\u201358, Bethesda (2006)","DOI":"10.1137\/1.9781611972764.5"},{"issue":"2","key":"161_CR13","first-page":"4","volume":"29","author":"I. Bhattacharya","year":"2006","unstructured":"Bhattacharya I., Getoor L.: Collective entity resolution in relational data. IEEE Data Eng. Bull. 29(2), 4\u201312 (2006)","journal-title":"IEEE Data Eng. Bull."},{"key":"161_CR14","first-page":"993","volume":"3","author":"D.M. Blei","year":"2003","unstructured":"Blei D.M., Ng A.Y., Jordan M.I.: Latent dirichlet\u00a0allocation. J. Mach. Learn. Res. 3, 993\u20131022 (2003)","journal-title":"J. Mach. Learn. Res."},{"key":"161_CR15","doi-asserted-by":"crossref","unstructured":"Boulos, J., Dalvi, N., Mandhani, B., Mathur, S., Re, C., Suciu, D.: MYSTIQ: a system for finding more answers by using probabilities. In: ACM SIGMOD Int\u2019l Conf. on the Mgmt. of Data, pp. 891\u2013893 (2005)","DOI":"10.1145\/1066157.1066277"},{"key":"161_CR16","doi-asserted-by":"crossref","unstructured":"Chaudhuri, S., Ganjam, K., Ganti, V., Motwani, R.: Robust and efficient fuzzy match for online data cleaning. In: ACM SIGMOD Int\u2019l Conf. on the Mgmt. of Data, pp. 313\u2013324 (2003)","DOI":"10.1145\/872757.872796"},{"key":"161_CR17","doi-asserted-by":"crossref","unstructured":"Chaudhuri, S., Ganti, V., Motwani, R.: Robust identification of fuzzy duplicates. In: IEEE Proc. of the Int\u2019l Conf. on Data Eng., pp. 865\u2013876, Washington (2005)","DOI":"10.1109\/ICDE.2005.125"},{"key":"161_CR18","doi-asserted-by":"crossref","unstructured":"Chaudhuri, S., Das Sarma, A., Ganti, V., Kaushik, R.: Leveraging aggregate constraints for deduplication. In: ACM SIGMOD Int\u2019l Conf. on the Mgmt. of Data, pp. 437\u2013448 (2007)","DOI":"10.1145\/1247480.1247530"},{"issue":"1","key":"161_CR19","doi-asserted-by":"crossref","first-page":"722","DOI":"10.14778\/1453856.1453935","volume":"1","author":"R. Cheng","year":"2008","unstructured":"Cheng R., Chen J., Xie X.: Cleaning uncertain data with quality guarantees. Proc. VLDB Endow. (PVLDB) 1(1), 722\u2013735 (2008)","journal-title":"Proc. VLDB Endow. (PVLDB)"},{"key":"161_CR20","unstructured":"Cohen, W.W., Ravikumar, P., Fienberg, S.E.: A comparison of string distance metrics for name-matching tasks. In: Proc. of IJCAI-03 Workshop on Information Integration on the Web (IIWeb-03), pp. 73\u201378, Acapulco (2003)"},{"issue":"4","key":"161_CR21","doi-asserted-by":"crossref","first-page":"523","DOI":"10.1007\/s00778-006-0004-3","volume":"16","author":"N. Dalvi","year":"2007","unstructured":"Dalvi N., Suciu D.: Efficient query evaluation on probabilistic databases. Int. J. Very Large Data Bases 16(4), 523\u2013544 (2007)","journal-title":"Int. J. Very Large Data Bases"},{"key":"161_CR22","doi-asserted-by":"crossref","unstructured":"Dalvi, N., Suciu, D.: Management of probabilistic data: foundations and challenges. In: ACM SIGMOD Int\u2019l Conf. on the Mgmt. of Data, pp. 1\u201312 (2007)","DOI":"10.1145\/1265530.1265531"},{"issue":"2","key":"161_CR23","doi-asserted-by":"crossref","first-page":"172","DOI":"10.1016\/j.tcs.2006.05.008","volume":"361","author":"E.D. Demaine","year":"2006","unstructured":"Demaine E.D., Emanuel D., Fiat A., Immorlica N.: Correlation clustering in general weighted graphs. Theor. Comput. Sci. 361(2), 172\u2013187 (2006)","journal-title":"Theor. Comput. Sci."},{"key":"161_CR24","unstructured":"Dong, X.L., Halevy, A.Y., Yu, C.: Data integration with uncertainty. In: Proc. of the Int\u2019l Conf. on Very Large Data Bases (VLDB), pp. 687\u2013698 (2007)"},{"issue":"1","key":"161_CR25","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TKDE.2007.250581","volume":"19","author":"A.K. Elmagarmid","year":"2007","unstructured":"Elmagarmid A.K., Ipeirotis P.G., Verykios V.S.: Duplicate record detection: a survey. IEEE Trans. Know. Data Eng. 19(1), 1\u201316 (2007)","journal-title":"IEEE Trans. Know. Data Eng."},{"issue":"328","key":"161_CR26","doi-asserted-by":"crossref","first-page":"1183","DOI":"10.1080\/01621459.1969.10501049","volume":"64","author":"I.P. Fellegi","year":"1969","unstructured":"Fellegi I.P., Sunter A.B.: A theory for record linkage. J. Am. Stat. Assoc. 64(328), 1183\u20131210 (1969)","journal-title":"J. Am. Stat. Assoc."},{"issue":"4","key":"161_CR27","doi-asserted-by":"crossref","first-page":"385","DOI":"10.1080\/15427951.2004.10129093","volume":"1","author":"G.W. Flake","year":"2004","unstructured":"Flake G.W., Tarjan R.E., Tsioutsiouliklis K.: Graph clustering and minimum cut trees. Internet Math. 1(4), 385\u2013408 (2004)","journal-title":"Internet Math."},{"key":"161_CR28","unstructured":"Gravano, L., Ipeirotis, P.G., Jagadish, H.V., Koudas, N., Muthukrishnan, S., Srivastava, D.: Approximate string joins in a database (Almost) for free. In: Proc. of the Int\u2019l Conf. on Very Large Data Bases (VLDB), pp. 491\u2013500 (2001)"},{"key":"161_CR29","unstructured":"Gupta, R., Sarawagi, S.: Creating probabilistic databases from information extraction models. In: Proc. of the Int\u2019l Conf. on Very Large Data Bases (VLDB), pp. 965\u2013976 (2006)"},{"key":"161_CR30","doi-asserted-by":"crossref","unstructured":"Gusfield, D.: Algorithms on strings, trees, and sequences. In: Computer Science and Computational Biology. Cambridge University Press, Cambridge (1997)","DOI":"10.1017\/CBO9780511574931"},{"key":"161_CR31","unstructured":"Hassanzadeh, O.: Benchmarking Declarative Approximate Selection Predicates. Master\u2019s thesis, University of Toronto, February 2007"},{"key":"161_CR32","doi-asserted-by":"crossref","unstructured":"Hassanzadeh, O., Chiang, F., Lee, H.C., Miller, R.J.: Framework for evaluating clustering algorithms in duplicate detection. In: Proc. of the Int\u2019l Conf. on Very Large Data Bases (VLDB), 2009","DOI":"10.14778\/1687627.1687771"},{"key":"161_CR33","unstructured":"Hassanzadeh, O., Sadoghi, M., Miller, R.J.: Accuracy of approximate string joins using grams. In: Proc. of the International Workshop on Quality in Databases (QDB), pp. 11\u201318, Vienna (2007)"},{"key":"161_CR34","unstructured":"Haveliwala, T.H., Gionis, A., Indyk, P.: Scalable techniques for clustering the web. In: Proc. of the Int\u2019l Workshop on the Web and Databases (WebDB), pp. 129\u2013134, Dallas (2000)"},{"issue":"1","key":"161_CR35","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1023\/A:1009761603038","volume":"2","author":"M.A. Hern\u00e1ndez","year":"1998","unstructured":"Hern\u00e1ndez M.A., Stolfo S.J.: Real-world data is dirty: data cleansing and the merge\/purge problem. Data Min. Know. Discov. 2(1), 9\u201337 (1998)","journal-title":"Data Min. Know. Discov."},{"key":"161_CR36","doi-asserted-by":"crossref","unstructured":"Indyk, P., Motwani, R., Raghavan, P., Vempala, S.: Locality-preserving hashing in multidimensional spaces. In: ACM Symp. on Theory of Computing (STOC), pp. 618\u2013625 (1997)","DOI":"10.1145\/258533.258656"},{"issue":"4","key":"161_CR37","doi-asserted-by":"crossref","first-page":"377","DOI":"10.1145\/146370.146380","volume":"24","author":"K. Kukich","year":"1992","unstructured":"Kukich K.: Techniques for automatically correcting words in text. ACM Comput. Surv. 24(4), 377\u2013439 (1992)","journal-title":"ACM Comput. Surv."},{"key":"161_CR38","unstructured":"Li, C., Wang, B., Yang, X.: VGRAM: Improving performance of approximate queries on string collections using variable-length grams. In: Proc. of the Int\u2019l Conf. on Very Large Data Bases (VLDB), pp. 303\u2013314, Vienna (2007)"},{"key":"161_CR39","doi-asserted-by":"crossref","unstructured":"McCallum, A., Nigam, K., Ungar, L.H.: Efficient clustering of high-dimensional data sets with application to reference matching. In: Proc. of the Int\u2019l Conf. on Knowledge Discovery & Data Mining, pp. 169\u2013178 (2000)","DOI":"10.1145\/347090.347123"},{"key":"161_CR40","doi-asserted-by":"crossref","unstructured":"Miller, D.R.H., Leek, T., Schwartz, R.M.: A hidden Markov model information retrieval system. In: ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 214\u2013221 (1999)","DOI":"10.1145\/312624.312680"},{"key":"161_CR41","unstructured":"Monge, A.E., Elkan, C.: An efficient domain-independent algorithm for detecting approximately duplicate database records. In: Proc. of SIGMOD Workshop on Data Mining and Knowledge Discovery (DMKD) (1997)"},{"issue":"4","key":"161_CR42","first-page":"3","volume":"23","author":"E. Rahm","year":"2000","unstructured":"Rahm E., Hai Do H.: Data cleaning: problems and current approaches. IEEE Data Eng. Bull. 23(4), 3\u201313 (2000)","journal-title":"IEEE Data Eng. Bull."},{"key":"161_CR43","doi-asserted-by":"crossref","unstructured":"Re, C., Dalvi, N., Suciu, D.: Efficient Top-k query evaluation on probabilistic data. In: IEEE Proc. of the Int\u2019l Conf. on Data Eng., pp. 886\u2013895 (2007)","DOI":"10.1109\/ICDE.2007.367934"},{"issue":"5","key":"161_CR44","doi-asserted-by":"crossref","first-page":"503","DOI":"10.1108\/00220410410560582","volume":"60","author":"S. Robertson","year":"2004","unstructured":"Robertson S.: Understanding inverse document frequency: on theoretical arguments for IDF. J. Doc. 60(5), 503\u2013520 (2004)","journal-title":"J. Doc."},{"key":"161_CR45","doi-asserted-by":"crossref","unstructured":"Sarawagi, S., Kirpal, A.: Efficient set joins on similarity predicates. In: ACM SIGMOD Int\u2019l Conf. on the Mgmt. of Data, pp. 743\u2013754, Paris (2004)","DOI":"10.1145\/1007568.1007652"},{"key":"161_CR46","doi-asserted-by":"crossref","unstructured":"Sen, P., Deshpande, A., Getoor, L.: Representing tuple and attribute uncertainty in probabilistic databases. In: ICDM Workshops, pp. 507\u2013512 (2007)","DOI":"10.1109\/ICDMW.2007.11"},{"key":"161_CR47","unstructured":"Slonim, N.: The Information Bottleneck: Theory and Applications. PhD thesis, The Hebrew University (2003)"},{"key":"161_CR48","doi-asserted-by":"crossref","unstructured":"Soliman, M.A., Ilyas, I.F., Chang, K.C.: Top-k query processing in uncertain databases. In: IEEE Proc. of the Int\u2019l Conf. on Data Eng., pp. 896\u2013905 (2007)","DOI":"10.1109\/ICDE.2007.367935"},{"issue":"3","key":"161_CR49","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1386118.1386119","volume":"33","author":"M.A. Soliman","year":"2008","unstructured":"Soliman M.A., Ilyas I.F., Chang K.C.: Probabilistic top-k and ranking-aggregate queries. ACM Trans. Database Syst. (TODS) 33(3), 1\u201354 (2008)","journal-title":"ACM Trans. Database Syst. (TODS)"},{"key":"161_CR50","unstructured":"van Dongen, S.: Graph Clustering By Flow Simulation. PhD thesis, University of Utrecht (2000)"},{"key":"161_CR51","unstructured":"Widom J.: Trio: a system for integrated management of data, accuracy, and lineage. In: Proc. of the Conference on Innovative Data Systems Research (CIDR), pp. 262\u2013276 (2005)"}],"container-title":["The VLDB Journal"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00778-009-0161-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00778-009-0161-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00778-009-0161-2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,29]],"date-time":"2019-05-29T15:05:05Z","timestamp":1559142305000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00778-009-0161-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2009,8,20]]},"references-count":51,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2009,10]]}},"alternative-id":["161"],"URL":"https:\/\/doi.org\/10.1007\/s00778-009-0161-2","relation":{},"ISSN":["1066-8888","0949-877X"],"issn-type":[{"value":"1066-8888","type":"print"},{"value":"0949-877X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2009,8,20]]}}}