{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,4]],"date-time":"2024-09-04T17:28:58Z","timestamp":1725470938224},"publisher-location":"Berlin, Heidelberg","reference-count":31,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783540457749"},{"type":"electronic","value":"9783540457756"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2006]]},"DOI":"10.1007\/11880561_10","type":"book-chapter","created":{"date-parts":[[2006,9,28]],"date-time":"2006-09-28T12:16:48Z","timestamp":1159445808000},"page":"110-121","source":"Crossref","is-referenced-by-count":10,"title":["Compact Features for Detection of Near-Duplicates in Distributed Retrieval"],"prefix":"10.1007","author":[{"given":"Yaniv","family":"Bernstein","sequence":"first","affiliation":[]},{"given":"Milad","family":"Shokouhi","sequence":"additional","affiliation":[]},{"given":"Justin","family":"Zobel","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"10_CR1","doi-asserted-by":"crossref","unstructured":"Allan, J., et al.: Challenges in information retrieval and language modeling: report of a workshop held at the center for intelligent information retrieval. In: SIGIR Forum, University of Massachusetts Amherst, September 2002, vol.\u00a037(1), pp. 31\u201347 (2003)","DOI":"10.1145\/945546.945549"},{"key":"10_CR2","doi-asserted-by":"crossref","unstructured":"Bernstein, Y., Zobel, J.: A scalable system for identifying co-derivative documents. In: Proc. String Processing and Information Retrieval Symposium, Padova, Italy, pp. 55\u201367 (2004)","DOI":"10.1007\/978-3-540-30213-1_6"},{"key":"10_CR3","doi-asserted-by":"crossref","unstructured":"Bernstein, Y., Zobel, J.: Redundant documents and search effectiveness. In: Proc. ACM CIKM Conf., Bremen, Germany, pp. 736\u2013743 (2005)","DOI":"10.1145\/1099554.1099733"},{"key":"10_CR4","doi-asserted-by":"crossref","unstructured":"Brin, S., Davis, J., Garc\u00eda-Molina, H.: Copy detection mechanisms for digital documents. In: Proc. ACM SIGMOD international conference on Management of Data, San Jose, California, pp. 398\u2013409 (1995)","DOI":"10.1145\/568271.223855"},{"issue":"8-13","key":"10_CR5","doi-asserted-by":"publisher","first-page":"1157","DOI":"10.1016\/S0169-7552(97)00031-7","volume":"29","author":"A.Z. Broder","year":"1997","unstructured":"Broder, A.Z., Glassman, S.C., Manasse, M.S., Zweig, G.: Syntactic clustering of the web. Computer Networks and ISDN Systems\u00a029(8-13), 1157\u20131166 (1997)","journal-title":"Computer Networks and ISDN Systems"},{"key":"10_CR6","first-page":"327","volume-title":"Proc. ACM symposium on Theory of computing (STOC)","author":"A.Z. Broder","year":"1998","unstructured":"Broder, A.Z., Charikar, M., Frieze, A.M., Mitzenmacher, M.: Min-wise independent permutations (extended abstract). In: Proc. ACM symposium on Theory of computing (STOC), pp. 327\u2013336. ACM Press, New York (1998)"},{"issue":"2","key":"10_CR7","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1145\/382979.383040","volume":"19","author":"J. Callan","year":"2001","unstructured":"Callan, J., Connell, M.: Query-based sampling of text databases. ACM Transactions on Information Systems\u00a019(2), 97\u2013130 (2001)","journal-title":"ACM Transactions on Information Systems"},{"key":"10_CR8","doi-asserted-by":"crossref","unstructured":"Callan, J., Lu, Z., Croft, W.B.: Searching distributed collections with inference networks. In: Proc. Int. ACM-SIGIR Conf., Seattle, Washington, pp. 21\u201328 (1995)","DOI":"10.1145\/215206.215328"},{"issue":"2","key":"10_CR9","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1145\/506309.506311","volume":"20","author":"A. Chowdhury","year":"2002","unstructured":"Chowdhury, A., Frieder, O., Grossman, D., McCabe, M.C.: Collection statistics for fast duplicate document detection. ACM Transactions on Information Systems\u00a020(2), 171\u2013191 (2002)","journal-title":"ACM Transactions on Information Systems"},{"key":"10_CR10","doi-asserted-by":"crossref","unstructured":"Conrad, J.G., Guo, X.S., Schriber, C.P.: Online duplicate document detection: Signature reliability in a dynamic retrieval environment. In: Proc. ACM-CIKM Conf., New Orleans, Louisiana, pp. 443\u2013452 (2003)","DOI":"10.1145\/956863.956946"},{"key":"10_CR11","doi-asserted-by":"crossref","unstructured":"Cooper, J.W., Coden, A.R., Brown, E.W.: Detecting similar documents using salient terms. In: Proc. ACM-CIKM Conf., McLean, Virginia, pp. 245\u2013251 (2002)","DOI":"10.1145\/584792.584835"},{"key":"10_CR12","first-page":"37","volume-title":"Proc. first Latin American Web Congress","author":"D. Fetterly","year":"2003","unstructured":"Fetterly, D., Manasse, M., Najork, M.: On the evolution of clusters of near-duplicate web pages. In: Proc. first Latin American Web Congress, pp. 37\u201345. IEEE, Los Alamitos (2003)"},{"issue":"9","key":"10_CR13","first-page":"637","volume":"2","author":"S. Gauch","year":"1996","unstructured":"Gauch, S., Wang, G., Gomez, M.: ProFusion: Intelligent fusion from multiple, distributed search engines. J. Universal Computer Science\u00a02(9), 637\u2013649 (1996)","journal-title":"J. Universal Computer Science"},{"key":"10_CR14","doi-asserted-by":"crossref","unstructured":"Gravano, L., Chang, C.K., Garcia-Molina, H., Paepcke, A.: STARTS: Stanford proposal for Internet meta-searching. In: Proc. ACM SIGMOD international conference on Management of Data, Tucson, Arizona, pp. 207\u2013218 (1997)","DOI":"10.1145\/253260.253299"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"Harman, D.: Overview of the first TREC conference. In: Proc. ACM-SIGIR Conf., Pittsburgh, Pennsylvania, pp. 36\u201347 (1993)","DOI":"10.1145\/160688.160692"},{"key":"10_CR16","doi-asserted-by":"crossref","unstructured":"Hernandez, T., Kambhampati, S.: Improving text collection selection with coverage and overlap statistics. In: Proc. Int. Conf. on World Wide Web, Chiba, Japan, pp. 1128\u20131129 (2005)","DOI":"10.1145\/1062745.1062902"},{"issue":"3","key":"10_CR17","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1002\/asi.10170","volume":"54","author":"T.C. Hoad","year":"2003","unstructured":"Hoad, T.C., Zobel, J.: Methods for identifying versioned and plagiarised documents. J. the American Society for Information Science and Technology\u00a054(3), 203\u2013215 (2003)","journal-title":"J. the American Society for Information Science and Technology"},{"key":"10_CR18","unstructured":"Ilyinski, S., Kuzmin, M., Melkov, A., Segalovich, I.: An efficient method to detect duplicates of web documents with the use of inverted index. In: Proc. Int. Conf. on World Wide Web, Honolulu, Hawaii (2002)"},{"key":"10_CR19","doi-asserted-by":"crossref","unstructured":"Kolcz, A., Chowdhury, A., Alspector, J.: Improved robustness of signature-based near-replica detection via lexicon randomization. In: Proc. ACM SIGKDD Int. Conf. on Knowledge Discovery and Data Mining, Seattle, WA, pp. 605\u2013610 (2004)","DOI":"10.1145\/1014052.1014127"},{"key":"10_CR20","unstructured":"Lyon, C., Malcolm, J., Dickerson, B.: Detecting short passages of similar text in large document collections. In: Proc. Conf. on Empirical Methods in Natural Language Processing, Philadelphia, Pennsylvania (2001)"},{"key":"10_CR21","unstructured":"Manber, U.: Finding similar files in a large file system. In: Proc. USENIX Winter Technical Conf., San Fransisco, CA, pp. 1\u201310, 17\u201321 (1994)"},{"issue":"1","key":"10_CR22","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1145\/505282.505284","volume":"34","author":"W. Meng","year":"2002","unstructured":"Meng, W., Yu, C., Liu, K.: Building efficient and effective metasearch engines. ACM Computing Surveys\u00a034(1), 48\u201389 (2002)","journal-title":"ACM Computing Surveys"},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"Nottelmann, H., Fuhr, N.: Evaluating different methods of estimating retrieval quality for resource selection. In: Proc. Int. ACM-SIGIR Conf., Toronto, Canada, pp. 290\u2013297 (2003)","DOI":"10.1145\/860435.860489"},{"issue":"4","key":"10_CR24","doi-asserted-by":"publisher","first-page":"412","DOI":"10.1145\/944012.944016","volume":"21","author":"A.L. Powell","year":"2003","unstructured":"Powell, A.L., French, J.: Comparing the performance of collection selection algorithms. ACM Transactions on Information Systems\u00a021(4), 412\u2013456 (2003)","journal-title":"ACM Transactions on Information Systems"},{"key":"10_CR25","unstructured":"Pugh, W., Henzinger, M.H.: Detecting duplicate and near-duplicate files (United States Patent 6,658,423) (2003)"},{"key":"10_CR26","doi-asserted-by":"crossref","unstructured":"Selberg, E., Etzioni, O.: The MetaCrawler architecture for resource aggregation on the Web. In: IEEE Expert (January\u2013February 1997), pp. 11\u201314 (1997)","DOI":"10.1109\/64.577468"},{"key":"10_CR27","doi-asserted-by":"crossref","unstructured":"Si, L., Callan, J.: Unified utility maximization framework for resource selection. In: Proc. ACM-CIKM Conf., Washington, D.C., pp. 32\u201341 (2004)","DOI":"10.1145\/1031171.1031180"},{"key":"10_CR28","doi-asserted-by":"crossref","unstructured":"Si, L., Callan, J.: Relevant document distribution estimation method for resource selection. In: Proc. ACM-SIGIR Conf., Toronto, Canada, pp. 298\u2013305 (2003)","DOI":"10.1145\/860435.860490"},{"key":"10_CR29","volume-title":"Hacker\u2019s Delight","author":"H.S. Warren Jr.","year":"2002","unstructured":"Warren Jr., H.S.: Hacker\u2019s Delight. Addison-Wesley, Reading (2002)"},{"key":"10_CR30","doi-asserted-by":"crossref","unstructured":"Zamir, O., Etzioni, O.: Grouper: a dynamic clustering interface to web search results. In: Proc. Int. Conf. on World Wide Web, Toronto, Canada, pp. 1361\u20131374 (1999)","DOI":"10.1016\/S1389-1286(99)00054-7"},{"key":"10_CR31","doi-asserted-by":"crossref","unstructured":"Zobel, J., Bernstein, Y.: The case of the duplicate documents: Measurement, search, and science. In: Proc. Asia-Pacific Web Conf., Harbin, China, pp. 26\u201339 (2006)","DOI":"10.1007\/11610113_4"}],"container-title":["Lecture Notes in Computer Science","String Processing and Information Retrieval"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/11880561_10.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,11,17]],"date-time":"2020-11-17T19:49:42Z","timestamp":1605642582000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/11880561_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2006]]},"ISBN":["9783540457749","9783540457756"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/11880561_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2006]]}}}