{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T06:11:24Z","timestamp":1775283084660,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":21,"publisher":"ACM","license":[{"start":{"date-parts":[[2007,5,8]],"date-time":"2007-05-08T00:00:00Z","timestamp":1178582400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2007,5,8]]},"DOI":"10.1145\/1242572.1242588","type":"proceedings-article","created":{"date-parts":[[2007,6,6]],"date-time":"2007-06-06T14:37:31Z","timestamp":1181140651000},"page":"111-120","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":28,"title":["Do not crawl in the dust"],"prefix":"10.1145","author":[{"given":"Ziv","family":"Bar-Yossef","sequence":"first","affiliation":[{"name":"Technion and Google, Haifa, Israel"}]},{"given":"Idit","family":"Keidar","sequence":"additional","affiliation":[{"name":"Technion, Haifa, Israel"}]},{"given":"Uri","family":"Schonfeld","sequence":"additional","affiliation":[{"name":"UCLA, Log Angeles, CA"}]}],"member":"320","published-online":{"date-parts":[[2007,5,8]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"487","volume-title":"Proc. 20th VLDB","author":"Agrawal R.","year":"1994","unstructured":"R. Agrawal and R. Srikant . Fast algorithms for mining association rules . In Proc. 20th VLDB , pages 487 -- 499 , 1994 . R. Agrawal and R. Srikant. Fast algorithms for mining association rules. In Proc. 20th VLDB, pages 487--499, 1994."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/S1389-1286(99)00021-3"},{"issue":"4","key":"e_1_3_2_1_4_1","first-page":"21","article-title":"A comparison of techniques to find mirrored hosts on the WWW","volume":"23","author":"Bharat K.","year":"2000","unstructured":"K. Bharat , A. Z. Broder , J. Dean , and M. R. Henzinger . A comparison of techniques to find mirrored hosts on the WWW . IEEE Data Engin. Bull. , 23 ( 4 ): 21 -- 26 , 2000 . K. Bharat, A. Z. Broder, J. Dean, and M. R. Henzinger. A comparison of techniques to find mirrored hosts on the WWW. IEEE Data Engin. Bull., 23(4):21--26, 2000.","journal-title":"IEEE Data Engin. Bull."},{"key":"e_1_3_2_1_5_1","volume-title":"A survey on abstract rewriting. Available online at: www.di.ubi.pt\/~desousa\/1998-1999\/logica\/mb.ps","author":"Bognar M.","year":"1995","unstructured":"M. Bognar . A survey on abstract rewriting. Available online at: www.di.ubi.pt\/~desousa\/1998-1999\/logica\/mb.ps , 1995 . M. Bognar. A survey on abstract rewriting. Available online at: www.di.ubi.pt\/~desousa\/1998-1999\/logica\/mb.ps, 1995."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/223784.223855"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/283554.283370"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/342009.335429"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/WI.2003.1241201"},{"key":"e_1_3_2_1_10_1","volume-title":"Proc. 1st USITS","author":"Douglis F.","year":"1997","unstructured":"F. Douglis , A. Feldman , B. Krishnamurthy , and J. Mogul . Rate of change and other metrics: a live study of the world wide web . In Proc. 1st USITS , 1997 . F. Douglis, A. Feldman, B. Krishnamurthy, and J. Mogul. Rate of change and other metrics: a live study of the world wide web. In Proc. 1st USITS, 1997."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/382006.383197"},{"key":"e_1_3_2_1_12_1","volume-title":"Computers and Intractability: A Guide to the Theory of NP-Completeness","author":"Garey M. R.","year":"1979","unstructured":"M. R. Garey and D. S. Johnson . Computers and Intractability: A Guide to the Theory of NP-Completeness . W. H. Freeman , 1979 . M. R. Garey and D. S. Johnson. Computers and Intractability: A Guide to the Theory of NP-Completeness. W. H. Freeman, 1979."},{"key":"e_1_3_2_1_13_1","unstructured":"Google Inc. Google sitemaps. http:\/\/sitemaps.google.com.  Google Inc. Google sitemaps. http:\/\/sitemaps.google.com."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511574931","volume-title":"Algorithms on Strings, Trees and Sequences: Computer Science and COmputational Biology","author":"Gusfield D.","year":"1997","unstructured":"D. Gusfield . Algorithms on Strings, Trees and Sequences: Computer Science and COmputational Biology . Cambridge University Press , 1997 . D. Gusfield. Algorithms on Strings, Trees and Sequences: Computer Science and COmputational Biology. Cambridge University Press, 1997."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1002\/asi.10170"},{"key":"e_1_3_2_1_16_1","first-page":"25","volume-title":"Proc. 7th WebDB","author":"Jain N.","year":"2005","unstructured":"N. Jain , M. Dahlin , and R. Tewari . Using bloom filters to refine web search results . In Proc. 7th WebDB , pages 25 -- 30 , 2005 . N. Jain, M. Dahlin, and R. Tewari. Using bloom filters to refine web search results. In Proc. 7th WebDB, pages 25--30, 2005."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/511446.511484"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/11751649_67"},{"key":"e_1_3_2_1_19_1","volume-title":"A URL-String-Based Algorithm for Finding WWW Mirror Host. Master's thesis","author":"Liang H.","year":"2001","unstructured":"H. Liang . A URL-String-Based Algorithm for Finding WWW Mirror Host. Master's thesis , Auburn University , 2001 . H. Liang. A URL-String-Based Algorithm for Finding WWW Mirror Host. Master's thesis, Auburn University, 2001."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/1149941.1149972"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/1135777.1135992"},{"key":"e_1_3_2_1_22_1","first-page":"204","volume-title":"Proc. 1st WebDB","author":"Shivakumar N.","year":"1998","unstructured":"N. Shivakumar and H. Garcia-Molina . Finding Near-Replicas of Documents and Servers on the Web . In Proc. 1st WebDB , pages 204 -- 212 , 1998 . N. Shivakumar and H. Garcia-Molina. Finding Near-Replicas of Documents and Servers on the Web. In Proc. 1st WebDB, pages 204--212, 1998."}],"event":{"name":"WWW'07: 16th International World Wide Web Conference","location":"Banff Alberta Canada","acronym":"WWW'07","sponsor":["ACM Association for Computing Machinery"]},"container-title":["Proceedings of the 16th international conference on World Wide Web"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/1242572.1242588","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/1242572.1242588","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T14:47:53Z","timestamp":1750258073000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/1242572.1242588"}},"subtitle":["different urls with similar text"],"short-title":[],"issued":{"date-parts":[[2007,5,8]]},"references-count":21,"alternative-id":["10.1145\/1242572.1242588","10.1145\/1242572"],"URL":"https:\/\/doi.org\/10.1145\/1242572.1242588","relation":{},"subject":[],"published":{"date-parts":[[2007,5,8]]},"assertion":[{"value":"2007-05-08","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}