{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,1,2]],"date-time":"2024-01-02T15:01:08Z","timestamp":1704207668023},"reference-count":26,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2007,3,9]],"date-time":"2007-03-09T00:00:00Z","timestamp":1173398400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Knowl Inf Syst"],"published-print":{"date-parts":[[2008,2]]},"DOI":"10.1007\/s10115-007-0071-9","type":"journal-article","created":{"date-parts":[[2007,3,8]],"date-time":"2007-03-08T17:12:31Z","timestamp":1173373951000},"page":"217-232","source":"Crossref","is-referenced-by-count":6,"title":["A systematic study on parameter correlations in large-scale duplicate document detection"],"prefix":"10.1007","volume":"14","author":[{"given":"Shaozhi","family":"Ye","sequence":"first","affiliation":[]},{"given":"Ji-Rong","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Wei-Ying","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2007,3,9]]},"reference":[{"key":"71_CR1","doi-asserted-by":"crossref","unstructured":"Bellare M, Kohno T (2004) Hash function balance and its impact on birthday attacks. In: EUROCRYPT 2004: international conference on the theory and applications of cryptographic techniques, pp 401\u2013418","DOI":"10.1007\/978-3-540-24676-3_24"},{"key":"71_CR2","doi-asserted-by":"crossref","unstructured":"Bharat K, Broder AZ (1999) Mirror, mirror on the Web: a study of host pairs with replicated content. In: Proceedings of the 8th international World Wide Web (WWW) conference, pp 501\u2013512","DOI":"10.1016\/S1389-1286(99)00021-3"},{"issue":"12","key":"71_CR3","doi-asserted-by":"crossref","first-page":"1114","DOI":"10.1002\/1097-4571(2000)9999:9999<::AID-ASI1025>3.0.CO;2-0","volume":"51","author":"K Bharat","year":"2000","unstructured":"Bharat K, Broder AZ, Dean J, Henzinger MR (2000) A comparison of techniques to find mirrored hosts on the WWW. J Am Soc Inf Sci (JASIS) 51(12):1114\u20131122","journal-title":"J Am Soc Inf Sci (JASIS)"},{"key":"71_CR4","doi-asserted-by":"crossref","unstructured":"Brin S, Davis J, Garcia-Molina H (1995) Copy detection mechanisms for digital documents. In: Proceedings of the 1995 ACM international conference on management of data (SIGMOD), pp 398\u2013409","DOI":"10.1145\/223784.223855"},{"key":"71_CR5","doi-asserted-by":"crossref","unstructured":"Broder AZ, Glassman SC, Manasse MS, Zweig G (1997) Syntactic clustering of the Web. In: Proceedings of the sixth international World Wide Web (WWW) conference, pp 1157\u20131166","DOI":"10.1016\/S0169-7552(97)00031-7"},{"key":"71_CR6","doi-asserted-by":"crossref","unstructured":"Cho J, Shivakumar N, Garcia-Molina H (2000) Finding replicated Web collections. In: Proceedings of the 2000 ACM international conference on management of data (SIGMOD), pp 355\u2013366","DOI":"10.1145\/335191.335429"},{"issue":"2","key":"71_CR7","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1145\/506309.506311","volume":"20","author":"A Chowdhury","year":"2002","unstructured":"Chowdhury A, Frieder O, Grossman D, McCabe MC (2002) Collection statistics for fast duplicate document detection. ACM Trans Inf Syst 20(2):171\u2013191","journal-title":"ACM Trans Inf Syst"},{"key":"71_CR8","doi-asserted-by":"crossref","unstructured":"Conrad JG, Guo XS, Schriber CP (2003) Online duplicate document detection: signature reliability in a dynamic retrieval environment. In: Proceedings of the 12th international conference on information and knowledge management (CIKM), pp 443\u2013452","DOI":"10.1145\/956863.956946"},{"key":"71_CR9","doi-asserted-by":"crossref","unstructured":"Cooper JW, Coden A, Brown EW (2002) Detecting similar documents using salient terms. In: Proceedings of the 11th ACM international conference on information and knowledge management (CIKM), pp 245\u2013251","DOI":"10.1145\/584792.584835"},{"key":"71_CR10","unstructured":"Crovella ME, Taqqu MS, Bestavros A (1998) Heavy-tailed probability distributions in the World Wide Web. In: Adler R, Feldman R, Taqqu M (eds.) A practical guide to heavy tails: statistical techniques and applications. Birkhauser, Boston, pp 3\u201325"},{"key":"71_CR11","doi-asserted-by":"crossref","unstructured":"Dean J, Henzinger MR (1999) Finding related pages in the World Wide Web. In: Proceeding of the 8th international World Wide Web conference (WWW), pp 1467\u20131479","DOI":"10.1016\/S1389-1286(99)00022-5"},{"key":"71_CR12","doi-asserted-by":"crossref","unstructured":"Fetterly D, Manasse M, Najork M (2003) On the evolution of clusters of near-duplicate Web pages. In: Proceedings of the 1st Latin American Web Congress (LA-Web), pp 37\u201345","DOI":"10.1109\/LAWEB.2003.1250280"},{"key":"71_CR13","doi-asserted-by":"crossref","unstructured":"Fetterly D, Manasse M, Najork M (2004) Spam, damn spam, and statistics: using statistical analysis to locate spam Web pages. In: Proceedings of the 7th international workshop on the Web and databases (WebDB), pp 1\u20136","DOI":"10.1145\/1017074.1017077"},{"key":"71_CR14","doi-asserted-by":"crossref","unstructured":"Fetterly D, Manasse M, Najork M, Wiener J (2003) A large-scale study of the evolution of Web pages. In: Proceedings of the 12th international World Wide Web (WWW) conference, pp 669\u2013678","DOI":"10.1145\/775152.775246"},{"key":"71_CR15","unstructured":"Gyongyi Z, Garcia-Molina H (2005) Web spam taxonomy, Technical report, Stanford University"},{"key":"71_CR16","unstructured":"Heintze N (1996) Scalable document fingerprinting. In: Proceedings of the 2nd USENIX electronic commerce workshop, pp 191\u2013200"},{"issue":"4","key":"71_CR17","doi-asserted-by":"crossref","first-page":"438","DOI":"10.1007\/s10115-004-0188-z","volume":"8","author":"Z Li","year":"2005","unstructured":"Li Z, Ng WK, Sun A (2005) Web data extraction based on structural similarity. Knowl Inf Syst 8(4):438\u2013461","journal-title":"Knowl Inf Syst"},{"issue":"2","key":"71_CR18","doi-asserted-by":"crossref","first-page":"230","DOI":"10.1007\/s10115-003-0112-y","volume":"6","author":"S Mukherjea1","year":"2004","unstructured":"Mukherjea1 S (2004) Discovering and analyzing World Wide Web collections. Knowl Inf Syst 6(2):230\u2013241","journal-title":"Knowl Inf Syst"},{"key":"71_CR19","unstructured":"Rabin M (1981) Fingerprinting by random polynomials, Technical report tr-15-81, Center for Research in Computing Technology, Harvard University"},{"key":"71_CR20","unstructured":"Shivakumar N, Garcia-Molina H (1998) Finding near-replicas of documents and servers on the Web. In: Proceedings of the 1st international workshop on World Wide Web and Databases (WebDB), pp 204\u2013212"},{"issue":"2","key":"71_CR21","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1145\/792550.792554","volume":"36","author":"I Soboroff","year":"2002","unstructured":"Soboroff I (2002) Do TREC Web collections look like the Web? SIGIR Forum 36(2):23\u201331","journal-title":"SIGIR Forum"},{"key":"71_CR22","doi-asserted-by":"crossref","unstructured":"Wang Y, Kitsuregawa M (2002) Evaluating contents-link coupled Web page clustering for Web search results. In: Proceedings of the 11th ACM international conference on information and knowledge management (CIKM), pp 499\u2013506","DOI":"10.1145\/584792.584875"},{"key":"71_CR23","doi-asserted-by":"crossref","unstructured":"Ye S, Song R, Wen J-R, Ma W-Y (2004) A query-dependent duplicate detection approach for large scale search engines. In: Proceedings of the 6th Asia-Pacific Web conference (APWeb), pp 48\u201358","DOI":"10.1007\/978-3-540-24655-8_6"},{"key":"71_CR24","doi-asserted-by":"crossref","unstructured":"Yi L, Liu B, Li X (2003) Eliminating noisy information in Web pages for data mining. In: Proceedings of the 9th ACM international conference on knowledge discovery and data mining (SIGKDD), pp 296\u2013305","DOI":"10.1145\/956750.956785"},{"key":"71_CR25","doi-asserted-by":"crossref","unstructured":"Zamir O, Etzioni O (1998) Web document clustering: a feasibility demonstration. In: Proceedings of the 21st annual international ACM conference on research and development in information retrieval (SIGIR), pp 46\u201354","DOI":"10.1145\/290941.290956"},{"issue":"3","key":"71_CR26","doi-asserted-by":"crossref","first-page":"374","DOI":"10.1007\/s10115-004-0194-1","volume":"8","author":"S Zhong","year":"2005","unstructured":"Zhong S, Ghosh J (2005) Generative model-based document clustering: a comparative study. Knowl Inf Syst 8(3):374\u2013384","journal-title":"Knowl Inf Syst"}],"container-title":["Knowledge and Information Systems"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10115-007-0071-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10115-007-0071-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10115-007-0071-9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,29]],"date-time":"2019-05-29T10:10:16Z","timestamp":1559124616000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10115-007-0071-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2007,3,9]]},"references-count":26,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2008,2]]}},"alternative-id":["71"],"URL":"https:\/\/doi.org\/10.1007\/s10115-007-0071-9","relation":{},"ISSN":["0219-1377","0219-3116"],"issn-type":[{"value":"0219-1377","type":"print"},{"value":"0219-3116","type":"electronic"}],"subject":[],"published":{"date-parts":[[2007,3,9]]}}}