{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:10:08Z","timestamp":1755864608001,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,9]],"date-time":"2024-06-09T00:00:00Z","timestamp":1717891200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,9]]},"DOI":"10.1145\/3626246.3653370","type":"proceedings-article","created":{"date-parts":[[2024,5,23]],"date-time":"2024-05-23T10:26:39Z","timestamp":1716459999000},"page":"80-92","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Similarity Joins of Sparse Features"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1084-8078","authenticated-orcid":false,"given":"Ahmed","family":"Metwally","sequence":"first","affiliation":[{"name":"Uber Technologies, Inc., Sunnyvale, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3689-6113","authenticated-orcid":false,"given":"Michael","family":"Shum","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, Cambridge, MA, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,6,9]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"918","volume-title":"Efficient Exact Set-Similarity Joins. In VLDB International Conference on Very Large Data Bases","author":"Arasu A.","year":"2006","unstructured":"A. Arasu, V. Ganti, and R. Kaushik. Efficient Exact Set-Similarity Joins. In VLDB International Conference on Very Large Data Bases, pages 918--929, 2006."},{"key":"e_1_3_2_2_2_1","first-page":"731","volume-title":"Lucchese. Document Similarity Self-Join with MapReduce. In IEEE ICDM International Conference on Data Mining","author":"Baraglia R.","year":"2010","unstructured":"R. Baraglia, G. De Francisci Morales, and C. Lucchese. Document Similarity Self-Join with MapReduce. In IEEE ICDM International Conference on Data Mining, pages 731--736, 2010."},{"key":"e_1_3_2_2_3_1","first-page":"131","volume-title":"Scaling Up All Pairs Similarity Search. In WWW International Conference on World Wide Web","author":"Bayardo R.","year":"2007","unstructured":"R. Bayardo, Y. Ma, and R. Srikant. Scaling Up All Pairs Similarity Search. In WWW International Conference on World Wide Web, pages 131--140, 2007."},{"key":"e_1_3_2_2_4_1","first-page":"21","volume-title":"On the Resemblance and Containment of Documents. In Compression and Complexity of SEQUENCES 1997 (Cat. No. 97TB100171)","author":"Broder A.","year":"1997","unstructured":"A. Broder. On the Resemblance and Containment of Documents. In Compression and Complexity of SEQUENCES 1997 (Cat. No. 97TB100171), pages 21--29. IEEE, 1997."},{"key":"e_1_3_2_2_5_1","first-page":"346","volume-title":"Compact Similarity Joins. In IEEE ICDE International Conference on Data Engineering","author":"Bryan B.","year":"2008","unstructured":"B. Bryan, F. Eberhardt, and C. Faloutsos. Compact Similarity Joins. In IEEE ICDE International Conference on Data Engineering, pages 346--355, 2008."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_6_1","DOI":"10.1016\/S0031-3203(01)00118-2"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_7_1","DOI":"10.1109\/ICDE.2006.9"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_8_1","DOI":"10.1109\/TKDE.2016.2631599"},{"key":"e_1_3_2_2_9_1","volume-title":"DartMinHash: Fast Sketching for Weighted Sets. arXiv preprint arXiv:2005.11547","author":"Christiani T.","year":"2020","unstructured":"T. Christiani. DartMinHash: Fast Sketching for Weighted Sets. arXiv preprint arXiv:2005.11547, 2020."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_10_1","DOI":"10.1109\/69.908981"},{"key":"e_1_3_2_2_11_1","first-page":"137","volume-title":"Mapreduce: Simplified Data Processing on Large Clusters. In USENIX OSDI Symposium on Operating System Design and Implementation","author":"Dean J.","year":"2004","unstructured":"J. Dean and S. Ghemawat. Mapreduce: Simplified Data Processing on Large Clusters. In USENIX OSDI Symposium on Operating System Design and Implementation, pages 137--150, 2004."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_12_1","DOI":"10.1145\/1327452.1327492"},{"key":"e_1_3_2_2_13_1","first-page":"340","volume-title":"MassJoin: A MapReduce-based Method for Scalable String Similarity Joins. In IEEE ICDE International Conference on Data Engineering","author":"Deng D.","year":"2014","unstructured":"D. Deng, G. Li, S. Hao, J. Wang, and J. Feng. MassJoin: A MapReduce-based Method for Scalable String Similarity Joins. In IEEE ICDE International Conference on Data Engineering, pages 340--351. IEEE, 2014."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_14_1","DOI":"10.14778\/2856318.2856330"},{"key":"e_1_3_2_2_15_1","first-page":"265","volume-title":"Pairwise Document Similarity in Large Collections with MapReduce. In HLT Meeting of the ACL on Human Language Technologies: Short Papers","author":"Elsayed T.","year":"2008","unstructured":"T. Elsayed, J. Lin, and D. Oard. Pairwise Document Similarity in Large Collections with MapReduce. In HLT Meeting of the ACL on Human Language Technologies: Short Papers, pages 265--268, 2008."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_16_1","DOI":"10.1145\/3219819.3220089"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_17_1","DOI":"10.14778\/3231751.3231760"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_18_1","DOI":"10.1007\/978-3-030-89657-7_2"},{"issue":"6","key":"e_1_3_2_2_19_1","first-page":"518","article-title":"Similarity Search in High Dimensions via Hashing","volume":"99","author":"Gionis A.","year":"1999","unstructured":"A. Gionis, P. Indyk, and R. Motwani. Similarity Search in High Dimensions via Hashing. VLDB Endowment, 99(6):518--529, 1999.","journal-title":"VLDB Endowment"},{"key":"e_1_3_2_2_20_1","first-page":"491","volume-title":"VLDB International Conference on Very Large Data Bases","author":"Gravano L.","year":"2001","unstructured":"L. Gravano, P. Ipeirotis, H. Jagadish, N. Koudas, S. Muthukrishnan, and D. Srivastava. Approximate String Joins in a Database (Almost) for Free. In VLDB International Conference on Very Large Data Bases, pages 491--500, 2001."},{"key":"e_1_3_2_2_21_1","first-page":"246","volume-title":"Sketching. In IEEE ICDM International Conference on Data Mining","author":"Ioffe S.","year":"2010","unstructured":"S. Ioffe. Improved Consistent Sampling, Weighted Minhash and L1 Sketching. In IEEE ICDM International Conference on Data Mining, pages 246--255. IEEE, 2010."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_22_1","DOI":"10.1007\/s00607-005-0139-x"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_23_1","DOI":"10.1016\/j.scico.2007.07.001"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_24_1","DOI":"10.1017\/9781108684163"},{"key":"e_1_3_2_2_25_1","first-page":"257","volume-title":"Efficient Merging and Filtering Algorithms for Approximate String Searches. In ICDE 42nd IEEE International Conference on Data Engineering","author":"Li C.","year":"2008","unstructured":"C. Li, J. Lu, and Y. Lu. Efficient Merging and Filtering Algorithms for Approximate String Searches. In ICDE 42nd IEEE International Conference on Data Engineering, pages 257--266, 2008."},{"issue":"3","key":"e_1_3_2_2_26_1","first-page":"253","volume":"5","author":"Li G.","year":"2012","unstructured":"G. Li, D. Deng, J. Wang, and J. Feng. Pass-Join: A Partition-based Method for Similarity Joins. VLDB Endowment, 5(3):253--264, 2012.","journal-title":"A Partition-based Method for Similarity Joins. VLDB Endowment"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_27_1","DOI":"10.1145\/2783258.2783406"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_28_1","DOI":"10.1609\/aaai.v35i5.16543"},{"key":"e_1_3_2_2_29_1","volume-title":"Consistent Weighted Sampling. Unpublished technical report) http:\/\/research. microsoft. com\/en-us\/people\/manasse, 2","author":"Manasse M.","year":"2010","unstructured":"M. Manasse, F. McSherry, and K. Talwar. Consistent Weighted Sampling. Unpublished technical report) http:\/\/research. microsoft. com\/en-us\/people\/manasse, 2, 2010."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_30_1","DOI":"10.14778\/2947618.2947620"},{"key":"e_1_3_2_2_31_1","volume-title":"Scaling and Load-Balancing Equi-Joins. arXiv preprint arXiv:2209.08475","author":"Metwally A.","year":"2022","unstructured":"A. Metwally. Scaling and Load-Balancing Equi-Joins. arXiv preprint arXiv:2209.08475, 2022."},{"key":"e_1_3_2_2_32_1","first-page":"2163","volume-title":"Scaling Equi-Joins. In ACM SIGMOD International Conference on Management of Data","author":"Metwally A.","year":"2022","unstructured":"A. Metwally. Scaling Equi-Joins. In ACM SIGMOD International Conference on Management of Data, pages 2163--2176, 2022."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_33_1","DOI":"10.1145\/1242572.1242606"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_34_1","DOI":"10.14778\/2212351.2212353"},{"key":"e_1_3_2_2_35_1","first-page":"1766","volume-title":"Huang. Scalable Similarity Joins of Tokenized Strings. In IEEE ICDE International Conference on Data Engineering","author":"Metwally A.","year":"2019","unstructured":"A. Metwally and C.-H. Huang. Scalable Similarity Joins of Tokenized Strings. In IEEE ICDE International Conference on Data Engineering, pages 1766--1777. IEEE, 2019."},{"key":"e_1_3_2_2_36_1","first-page":"74","volume-title":"Gapped Local Similarity Search with Provable Guarantees. In International Workshop on Algorithms in Bioinformatics","author":"Narayanan M.","year":"2004","unstructured":"M. Narayanan and R. Karp. Gapped Local Similarity Search with Provable Guarantees. In International Workshop on Algorithms in Bioinformatics, pages 74--86. Springer, 2004."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_37_1","DOI":"10.1145\/3269206.3271690"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_38_1","DOI":"10.1007\/s10766-022-00733-6"},{"key":"e_1_3_2_2_39_1","first-page":"1059","volume-title":"Fast and Scalable Distributed Set Similarity Joins for Big Data Analytics. In IEEE ICDE International Conference on Data Engineering","author":"Rong C.","year":"2017","unstructured":"C. Rong, C. Lin, Y. Silva, J.Wang,W. Lu, and X. Du. Fast and Scalable Distributed Set Similarity Joins for Big Data Analytics. In IEEE ICDE International Conference on Data Engineering, pages 1059--1070. IEEE, 2017."},{"issue":"10","key":"e_1_3_2_2_40_1","first-page":"2217","volume":"25","author":"Rong C.","year":"2012","unstructured":"C. Rong, W. Lu, X. Wang, X. Du, Y. Chen, and A. Tung. Efficient and Scalable Processing of String Similarity Join. IEEE Transactions on Knowledge and Data Engineering, 25(10):2217--2230, 2012.","journal-title":"Efficient and Scalable Processing of String Similarity Join. IEEE Transactions on Knowledge and Data Engineering"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_41_1","DOI":"10.14778\/2732977.2732981"},{"key":"e_1_3_2_2_42_1","first-page":"29","author":"Shrivastava A.","year":"2016","unstructured":"A. Shrivastava. Simple and Efficient Weighted Minwise Hashing. NeurIPS Advances in Neural Information Processing Systems, 29, 2016.","journal-title":"NeurIPS Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_43_1","first-page":"693","volume-title":"Exploiting MapReduce-based Similarity Joins. In 32nd ACM SIGMOD International Conference on Management of Data","author":"Silva Y.","year":"2012","unstructured":"Y. Silva and J. Reed. Exploiting MapReduce-based Similarity Joins. In 32nd ACM SIGMOD International Conference on Management of Data, pages 693--696, 2012."},{"doi-asserted-by":"crossref","unstructured":"R. Stanley. Enumerative Combinatorics volume 1. Cambridge University Press 2002.","key":"e_1_3_2_2_44_1","DOI":"10.1007\/978-1-4615-9763-6_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_45_1","DOI":"10.14778\/3137765.3137810"},{"key":"e_1_3_2_2_46_1","first-page":"495","volume-title":"Efficient Parallel Set-Similarity Joins Using MapReduce. In ACM SIGMOD International Conference on Management of Data","author":"Vernica R.","year":"2010","unstructured":"R. Vernica, M. Carey, and C. Li. Efficient Parallel Set-Similarity Joins Using MapReduce. In ACM SIGMOD International Conference on Management of Data, pages 495--506, 2010."},{"key":"e_1_3_2_2_47_1","first-page":"829","volume-title":"Scalable All-Pairs Similarity Search in Metric Spaces. In ACM SIGKDD International Conference on Knowledge Discovery and Data Mining","author":"Wang Y.","year":"2013","unstructured":"Y.Wang, A. Metwally, and S. Parthasarathy. Scalable All-Pairs Similarity Search in Metric Spaces. In ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pages 829--837, 2013."},{"issue":"6","key":"e_1_3_2_2_48_1","first-page":"2553","volume":"34","author":"Wu W.","year":"2020","unstructured":"W. Wu, B. Li, L. Chen, J. Gao, and C. Zhang. A Review for Weighted Min-Hash Algorithms. TKDE IEEE Transactions on Knowledge and Data Engineering, 34(6):2553--2573, 2020.","journal-title":"A Review for Weighted Min-Hash Algorithms. TKDE IEEE Transactions on Knowledge and Data Engineering"},{"key":"e_1_3_2_2_49_1","first-page":"1035","volume-title":"Consistent Weighted Sampling Made More Practical. In WWW International Conference on World Wide Web","author":"Wu W.","year":"2017","unstructured":"W. Wu, B. Li, L. Chen, and C. Zhang. Consistent Weighted Sampling Made More Practical. In WWW International Conference on World Wide Web, pages 1035--1043, 2017."},{"doi-asserted-by":"publisher","key":"e_1_3_2_2_50_1","DOI":"10.1109\/TKDE.2018.2876250"},{"issue":"1","key":"e_1_3_2_2_51_1","first-page":"933","volume":"1","author":"Xiao C.","year":"2008","unstructured":"C. Xiao, W. Wang, and X. Lin. Ed-Join: An Efficient Algorithm for Similarity Joins With Edit Distance Constraints. VLDB Endowment, 1(1):933--944, 2008.","journal-title":"An Efficient Algorithm for Similarity Joins With Edit Distance Constraints. VLDB Endowment"},{"key":"e_1_3_2_2_52_1","first-page":"131","volume-title":"Efficient Similarity Joins for Near Duplicate Detection. In WWW International Conference on World Wide Web","author":"Xiao C.","year":"2008","unstructured":"C. Xiao, W. Wang, X. Lin, and J. Yu. Efficient Similarity Joins for Near Duplicate Detection. In WWW International Conference on World Wide Web, pages 131--140, 2008."},{"key":"e_1_3_2_2_53_1","volume-title":"Spark: Cluster Computing with Working Sets. HotCloud, 10(10--10):95","author":"Zaharia M.","year":"2010","unstructured":"M. Zaharia, M. Chowdhury, M. Franklin, S. Shenker, and I. Stoica. Spark: Cluster Computing with Working Sets. HotCloud, 10(10--10):95, 2010."},{"key":"e_1_3_2_2_54_1","volume-title":"A Survey of Large Language Models. arXiv preprint arXiv:2303.18223","author":"Zhao W.","year":"2023","unstructured":"W. Zhao, K. Zhou, J. Li, T. Tang, X. Wang, Y. Hou, Y. Min, B. Zhang, J. Zhang, Z. Dong, D. Yifan, Y. Chen, C. Yushuo, C. Zhipeng, J. Jinhao, R. Ruiyang, L. Yifan, T. Xinyu, L. Zikang, L. Peiyu, N. Jian-Yun, and W. Ji-Rong. A Survey of Large Language Models. arXiv preprint arXiv:2303.18223, 2023."}],"event":{"sponsor":["SIGMOD ACM Special Interest Group on Management of Data"],"acronym":"SIGMOD\/PODS '24","name":"SIGMOD\/PODS '24: International Conference on Management of Data","location":"Santiago AA Chile"},"container-title":["Companion of the 2024 International Conference on Management of Data"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626246.3653370","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3626246.3653370","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T11:29:24Z","timestamp":1755862164000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626246.3653370"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,9]]},"references-count":54,"alternative-id":["10.1145\/3626246.3653370","10.1145\/3626246"],"URL":"https:\/\/doi.org\/10.1145\/3626246.3653370","relation":{},"subject":[],"published":{"date-parts":[[2024,6,9]]},"assertion":[{"value":"2024-06-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}