{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,19]],"date-time":"2025-12-19T09:31:08Z","timestamp":1766136668351},"reference-count":25,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2015,11,9]],"date-time":"2015-11-09T00:00:00Z","timestamp":1447027200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Cluster Comput"],"published-print":{"date-parts":[[2016,3]]},"DOI":"10.1007\/s10586-015-0506-0","type":"journal-article","created":{"date-parts":[[2015,11,9]],"date-time":"2015-11-09T03:41:56Z","timestamp":1447040516000},"page":"109-126","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Efficient top-k similarity document search utilizing distributed file systems and cosine similarity"],"prefix":"10.1007","volume":"19","author":[{"given":"Mahmoud","family":"Alewiwi","sequence":"first","affiliation":[]},{"given":"Cengiz","family":"Orencik","sequence":"additional","affiliation":[]},{"given":"Erkay","family":"Sava\u015f","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2015,11,9]]},"reference":[{"issue":"3","key":"506_CR1","doi-asserted-by":"crossref","first-page":"263","DOI":"10.1016\/j.datak.2004.08.003","volume":"53","author":"F Angiulli","year":"2005","unstructured":"Angiulli, F., Pizzuti, C.: An approximate algorithm for top-k closest pairs join query in large high dimensional data. Data Knowl. Eng. 53(3), 263\u2013281 (2005)","journal-title":"Data Knowl. Eng."},{"key":"506_CR2","unstructured":"Apache Hadoop. http:\/\/hadoop.apache.org"},{"key":"506_CR3","unstructured":"Arasu, A., Ganti, V., Kaushik, R.: Efficient exact set-similarity joins. In: Proceedings of the 32nd International Conference on Very Large Data Bases, VLDB \u201906, pp. 918\u2013929. VLDB Endowment (2006)"},{"key":"506_CR4","doi-asserted-by":"crossref","unstructured":"Baraglia, R., De\u00a0Francisci\u00a0Morales, G., Lucchese, C.: Document similarity self-join with MapReduce. In: 2010 IEEE 10th International Conference on Data Mining (ICDM), pp. 731\u2013736 (2010). doi: 10.1109\/ICDM.2010.70","DOI":"10.1109\/ICDM.2010.70"},{"key":"506_CR5","doi-asserted-by":"crossref","unstructured":"Bayardo, R.J., Ma, Y., Srikant, R.: Scaling up all pairs similarity search. In: Proceedings of the 16th International Conference on World Wide Web, WWW \u201907, pp. 131\u2013140. ACM, New York (2007). doi: 10.1145\/1242572.1242591","DOI":"10.1145\/1242572.1242591"},{"issue":"1","key":"506_CR6","doi-asserted-by":"crossref","first-page":"106","DOI":"10.1145\/1539024.1508904","volume":"41","author":"RA Brown","year":"2009","unstructured":"Brown, R.A.: Hadoop at home: large-scale computing at a small college. SIGCSE Bull. 41(1), 106\u2013110 (2009). doi: 10.1145\/1539024.1508904","journal-title":"SIGCSE Bull."},{"key":"506_CR7","doi-asserted-by":"crossref","unstructured":"Chaudhuri, S., Ganti, V., Kaushik, R.: A primitive operator for similarity joins in data cleaning. In: Proceedings of the 22nd International Conference on Data Engineering, ICDE \u201906, p. 5. IEEE Computer Society, Washington, DC (2006). doi: 10.1109\/ICDE.2006.9","DOI":"10.1109\/ICDE.2006.9"},{"issue":"4","key":"506_CR8","doi-asserted-by":"crossref","first-page":"599","DOI":"10.1109\/TVCG.2010.9","volume":"16","author":"M Connor","year":"2010","unstructured":"Connor, M., Kumar, P.: Fast construction of k-nearest neighbor graphs for point clouds. IEEE Trans. Vis. Comput. Graph. 16(4), 599\u2013608 (2010). doi: 10.1109\/TVCG.2010.9","journal-title":"IEEE Trans. Vis. Comput. Graph."},{"issue":"1","key":"506_CR9","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1145\/1327452.1327492","volume":"51","author":"J Dean","year":"2008","unstructured":"Dean, J., Ghemawat, S.: Mapreduce: simplified data processing on large clusters. Commun. ACM 51(1), 107\u2013113 (2008). doi: 10.1145\/1327452.1327492","journal-title":"Commun. ACM"},{"key":"506_CR10","doi-asserted-by":"crossref","unstructured":"Elsayed, T., Lin, J., Oard, D.W.: Pairwise document similarity in large collections with mapreduce. In: Proceedings of the 46th Annual Meeting of the Association for Computational Linguistics on Human Language Technologies: Short Papers. HLT-Short \u201908, pp. 265\u2013268. Association for Computational Linguistics, Stroudsburg (2008)","DOI":"10.3115\/1557690.1557767"},{"key":"506_CR11","unstructured":"Enron Dataset. http:\/\/www.cs.cmu.edu\/~.\/enron\/"},{"key":"506_CR12","doi-asserted-by":"crossref","unstructured":"Falchi, F., Perego, R., Lucchese, C., Rabitti, F., Orlando, S.: A metric cache for similarity search. In: LSDS-IR (2008)","DOI":"10.1145\/1458469.1458473"},{"key":"506_CR13","unstructured":"Lewis, D.D., Yang, Y., Rose, T.G., Li, F.: RCV1: a new benchmark collection for text categorization research. J. Mach. Learn. Res. 5, 361\u2013397 (2004). http:\/\/dl.acm.org\/citation.cfm?id=1005332.1005345"},{"key":"506_CR14","first-page":"412","volume-title":"APWeb. Lecture Notes in Computer Science","author":"R Li","year":"2011","unstructured":"Li, R., Ju, L., Peng, Z., Yu, Z., Wang, C.: Batch text similarity search with mapreduce. In: Du, X., Fan, W., Peng, Z., Sharaf, M.A. (eds.) APWeb. Lecture Notes in Computer Science, vol. 6612, pp. 412\u2013423. Springer, Heidelberg (2011)"},{"key":"506_CR15","unstructured":"Lucene. http:\/\/lucene.apache.org\/"},{"key":"506_CR16","doi-asserted-by":"crossref","DOI":"10.1017\/CBO9780511809071","volume-title":"Introduction to Information Retrieval","author":"CD Manning","year":"2008","unstructured":"Manning, C.D., Raghavan, P., Sch\u00fctze, H.: Introduction to Information Retrieval. Cambridge University Press, New York (2008)"},{"key":"506_CR17","doi-asserted-by":"crossref","unstructured":"Phan, T.C., d\u2019Orazio, L., Rigaux, P.: Toward intersection filter-based optimization for joins in mapreduce. In: Cloud-I\u201913, p. 2 (2013)","DOI":"10.1145\/2501928.2501932"},{"key":"506_CR18","volume-title":"Mining of Massive Datasets","author":"A Rajaraman","year":"2012","unstructured":"Rajaraman, A., Ullman, J.D.: Mining of Massive Datasets. Cambridge University Press, Cambridge (2012)"},{"key":"506_CR19","doi-asserted-by":"crossref","unstructured":"Sarawagi, S., Kirpal, A.: Efficient set joins on similarity predicates. In: Proceedings of the 2004 ACM SIGMOD International Conference on Management of Data, SIGMOD \u201904, pp. 743\u2013754. ACM, New York (2004). doi: 10.1145\/1007568.1007652","DOI":"10.1145\/1007568.1007652"},{"issue":"3","key":"506_CR20","first-page":"20:1","volume":"35","author":"Y Tao","year":"2010","unstructured":"Tao, Y., Yi, K., Sheng, C., Kalnis, P.: Efficient and accurate nearest neighbor and closest pair search in high-dimensional space. ACM Trans. Database Syst. 35(3), 20:1\u201320:46 (2010). doi: 10.1145\/1806907.1806912","journal-title":"ACM Trans. Database Syst."},{"key":"506_CR21","doi-asserted-by":"crossref","unstructured":"Vernica, R., Carey, M.J., Li, C.: Efficient parallel set-similarity joins using mapreduce. In: Proceedings of the 2010 ACM SIGMOD International Conference on Management of Data, SIGMOD \u201910, pp. 495\u2013506. ACM, New York (2010). doi: 10.1145\/1807167.1807222","DOI":"10.1145\/1807167.1807222"},{"key":"506_CR22","doi-asserted-by":"crossref","unstructured":"Xiao, C., Wang, W., Lin, X., Yu, J.X.: Efficient similarity joins for near duplicate detection. In: Proceedings of the 17th International Conference on World Wide Web, WWW \u201908, pp. 131\u2013140. ACM, New York (2008). doi: 10.1145\/1367497.1367516","DOI":"10.1145\/1367497.1367516"},{"key":"506_CR23","doi-asserted-by":"crossref","unstructured":"Yang, B., Myung, J., Lee, S.G., Lee, D.: A mapreduce-based filtering algorithm for vector similarity join. In: Proceedings of the 7th International Conference on Ubiquitous Information Management and Communication, ICUIMC \u201913, pp. 71:1\u201371:5. ACM, New York (2013). doi: 10.1145\/2448556.2448627","DOI":"10.1145\/2448556.2448627"},{"key":"506_CR24","doi-asserted-by":"crossref","unstructured":"Zhang, C., Li, F., Jestes, J.: Efficient parallel knn joins for large data in mapreduce. In: Proceedings of the 15th International Conference on Extending Database Technology, EDBT \u201912, pp. 38\u201349. ACM, New York (2012). doi: 10.1145\/2247596.2247602","DOI":"10.1145\/2247596.2247602"},{"issue":"1","key":"506_CR25","doi-asserted-by":"crossref","first-page":"60","DOI":"10.1016\/j.datak.2010.08.004","volume":"70","author":"S Zhu","year":"2011","unstructured":"Zhu, S., Wu, J., Xiong, H., Xia, G.: Scaling up top-k cosine similarity search. Data Knowl. Eng. 70(1), 60\u201383 (2011)","journal-title":"Data Knowl. Eng."}],"container-title":["Cluster Computing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-015-0506-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10586-015-0506-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10586-015-0506-0","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,5,31]],"date-time":"2019-05-31T11:37:45Z","timestamp":1559302665000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10586-015-0506-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,11,9]]},"references-count":25,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2016,3]]}},"alternative-id":["506"],"URL":"https:\/\/doi.org\/10.1007\/s10586-015-0506-0","relation":{},"ISSN":["1386-7857","1573-7543"],"issn-type":[{"value":"1386-7857","type":"print"},{"value":"1573-7543","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015,11,9]]}}}