{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T22:55:45Z","timestamp":1770418545451,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,6,10]],"date-time":"2022-06-10T00:00:00Z","timestamp":1654819200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,6,10]]},"DOI":"10.1145\/3514221.3526178","type":"proceedings-article","created":{"date-parts":[[2022,6,12]],"date-time":"2022-06-12T02:33:49Z","timestamp":1655001229000},"page":"1146-1159","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["TxtAlign: Efficient Near-Duplicate Text Alignment Search via Bottom-k Sketches for Plagiarism Detection"],"prefix":"10.1145","author":[{"given":"Zhizhi","family":"Wang","sequence":"first","affiliation":[{"name":"Rutgers University, New Brunswick, NJ, USA"}]},{"given":"Chaoji","family":"Zuo","sequence":"additional","affiliation":[{"name":"Rutgers University, New Brunswick, NJ, USA"}]},{"given":"Dong","family":"Deng","sequence":"additional","affiliation":[{"name":"Rutgers University, New Brunswick, NJ, USA"}]}],"member":"320","published-online":{"date-parts":[[2022,6,11]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Size comparision, accessed","author":"Wikipedia","year":"2021","unstructured":"Wikipedia: Size comparision, accessed September 14, 2021. https:\/\/en.wikipedia.org\/wiki\/Wikipedia:Size_comparisons."},{"key":"e_1_3_2_1_2_1","first-page":"918","volume-title":"VLDB","author":"Arasu A.","year":"2006","unstructured":"A. Arasu, V. Ganti, and R. Kaushik. Efficient exact set-similarity joins. In VLDB, pages 918--929, 2006."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/646978.711822"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1242572.1242591"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-842X.1972.tb00899.x"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/223784.223855"},{"key":"e_1_3_2_1_7_1","first-page":"21","volume-title":"Proceedings. Compression and Complexity of SEQUENCES","author":"Broder A. Z.","year":"1997","unstructured":"A. Z. Broder. On the resemblance and containment of documents. In Proceedings. Compression and Complexity of SEQUENCES, pages 21--29. IEEE, 1997."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1006\/jcss.1999.1690"},{"key":"e_1_3_2_1_9_1","volume-title":"Syntactic clustering of the web. Comput. Networks, 29(8--13):1157--1166","author":"Broder A. Z.","year":"1997","unstructured":"A. Z. Broder, S. C. Glassman, M. S. Manasse, and G. Zweig. Syntactic clustering of the web. Comput. Networks, 29(8--13):1157--1166, 1997."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1006\/jcss.1997.1534"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2588555.2593675"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.14778\/2856318.2856330"},{"key":"e_1_3_2_1_13_1","first-page":"905","volume-title":"Proceedings of the 2018 International Conference on Management of Data, SIGMOD Conference 2018","author":"Deng D.","year":"2018","unstructured":"D. Deng, Y. Tao, and G. Li. Overlap set similarity joins with theoretical guarantees. In G. Das, C. M. Jermaine, and P. A. Bernstein, editors, Proceedings of the 2018 International Conference on Management of Data, SIGMOD Conference 2018, Houston, TX, USA, June 10--15, 2018, pages 905--920. ACM, 2018."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2016.04.006"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/S15-2011"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457548"},{"key":"e_1_3_2_1_17_1","first-page":"137","volume-title":"Discrete Mathematics and Theoretical Computer Science","author":"Flajolet P.","year":"2007","unstructured":"P. Flajolet, \u00c9. Fusy, O. Gandouet, and F. Meunier. Hyperloglog: the analysis of a near-optimal cardinality estimation algorithm. In Discrete Mathematics and Theoretical Computer Science, pages 137--156. Discrete Mathematics and Theoretical Computer Science, 2007."},{"key":"e_1_3_2_1_18_1","volume-title":"Probabilistic counting algorithms for data base applications. Journal of computer and system sciences, 31(2):182--209","author":"Flajolet P.","year":"1985","unstructured":"P. Flajolet and G. N. Martin. Probabilistic counting algorithms for data base applications. Journal of computer and system sciences, 31(2):182--209, 1985."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3345317"},{"key":"e_1_3_2_1_20_1","volume-title":"Semantic analysis: A practical introduction","author":"Goddard C.","year":"2011","unstructured":"C. Goddard. Semantic analysis: A practical introduction. Oxford University Press, 2011."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2016.06.007"},{"key":"e_1_3_2_1_22_1","volume-title":"Working Notes Papers of the CLEF 2015 Evaluation Labs, CEUR Workshop Proceedings. CEUR-WS.org","author":"Hagen M.","year":"2015","unstructured":"M. Hagen, M. Potthast, and B. Stein. Source Retrieval for Plagiarism Detection from Large Web Corpora: Recent Approaches. In Working Notes Papers of the CLEF 2015 Evaluation Labs, CEUR Workshop Proceedings. CEUR-WS.org, Sept. 2015."},{"key":"e_1_3_2_1_23_1","first-page":"61","volume-title":"Proceedings of the 18th International Conference on World Wide Web, WWW 2009","author":"Hamid O. A.","year":"2009","unstructured":"O. A. Hamid, B. Behzadi, S. Christoph, and M. R. Henzinger. Detecting the origin of text segments efficiently. In J. Quemada, G. Le\u00f3n, Y. S. Maarek, and W. Nejdl, editors, Proceedings of the 18th International Conference on World Wide Web, WWW 2009, Madrid, Spain, April 20--24, 2009, pages 61--70. ACM, 2009."},{"key":"e_1_3_2_1_24_1","volume-title":"arXiv preprint arXiv:1301.6705","author":"Hofmann T.","year":"2013","unstructured":"T. Hofmann. Probabilistic latent semantic analysis. arXiv preprint arXiv:1301.6705, 2013."},{"key":"e_1_3_2_1_25_1","first-page":"81","volume-title":"Proceedings of the 18th International Conference on World Wide Web, WWW 2009","author":"Kim J. W.","year":"2009","unstructured":"J. W. Kim, K. S. Candan, and J. Tatemura. Efficient overlap and content reuse detection in blogs and online news articles. In J. Quemada, G. Le\u00f3n, Y. S. Maarek, and W. Nejdl, editors, Proceedings of the 18th International Conference on World Wide Web, WWW 2009, Madrid, Spain, April 20--24, 2009, pages 81--90. ACM, 2009."},{"key":"e_1_3_2_1_26_1","unstructured":"S. Kliff. 2016 (accessed August 23 2020). https:\/\/www.vox.com\/2016\/7\/19\/12227658\/melania-plagarism-trump-excuses."},{"key":"e_1_3_2_1_27_1","volume-title":"An introduction to latent semantic analysis. Discourse processes, 25(2--3):259--284","author":"Landauer T. K.","year":"1998","unstructured":"T. K. Landauer, P. W. Foltz, and D. Laham. An introduction to latent semantic analysis. Discourse processes, 25(2--3):259--284, 1998."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2008.4497434"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/1989323.1989379"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.14778\/2078331.2078340"},{"key":"e_1_3_2_1_31_1","first-page":"953","article-title":"One sketch for all: Theory and application of conditional random sampling","volume":"21","author":"Li P.","year":"2008","unstructured":"P. Li, K. Church, and T. Hastie. One sketch for all: Theory and application of conditional random sampling. Advances in Neural Information Processing Systems, 21:953--960, 2008.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.3115\/1220575.1220664"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/1772690.1772759"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/1978542.1978566"},{"key":"e_1_3_2_1_35_1","first-page":"3122","volume-title":"NIPS","author":"Li P.","year":"2012","unstructured":"P. Li, A. B. Owen, and C. Zhang. One permutation hashing. In P. L. Bartlett, F. C. N. Pereira, C. J. C. Burges, L. Bottou, and K. Q. Weinberger, editors, NIPS, pages 3122--3130, 2012."},{"key":"e_1_3_2_1_36_1","first-page":"1","volume-title":"USENIX","author":"Manber U.","year":"1994","unstructured":"U. Manber. Finding similar files in a large file system. In USENIX Winter 1994 Technical Conference, San Francisco, California, USA, January 17--21, 1994, Conference Proceedings, pages 1--10. USENIX Association, 1994."},{"key":"e_1_3_2_1_37_1","volume-title":"Sequential poisson sampling. Journal of official Statistics, 14(2):149","author":"Ohlsson E.","year":"1998","unstructured":"E. Ohlsson. Sequential poisson sampling. Journal of official Statistics, 14(2):149, 1998."},{"key":"e_1_3_2_1_38_1","volume-title":"2nd International Competition on Plagiarism Detection. In M. Braschler, D. Harman, and E. Pianta, editors, Working Notes Papers of the CLEF 2010 Evaluation Labs","author":"Potthast M.","year":"2010","unstructured":"M. Potthast, A. Barr\u00f3n-Cede\u00f1o, A. Eiselt, B. Stein, and P. Rosso. Overview of the 2nd International Competition on Plagiarism Detection. In M. Braschler, D. Harman, and E. Pianta, editors, Working Notes Papers of the CLEF 2010 Evaluation Labs, Sept. 2010."},{"key":"e_1_3_2_1_39_1","volume-title":"Working Notes Papers of the CLEF 2011 Evaluation Labs.","author":"Potthast M.","unstructured":"M. Potthast, A. Eiselt, A. Barr\u00f3n-Cede\u00f1o, B. Stein, and P. Rosso. In Working Notes Papers of the CLEF 2011 Evaluation Labs."},{"key":"e_1_3_2_1_40_1","volume-title":"4th International Competition on Plagiarism Detection. In P. Forner, J. Karlgren, and C. Womser-Hacker, editors, Working Notes Papers of the CLEF 2012 Evaluation Labs","author":"Potthast M.","year":"2012","unstructured":"M. Potthast, T. Gollub, M. Hagen, J. Gra\u00dfegger, J. Kiesel, M. Michel, A. Oberl\u00e4nder, M. Tippmann, A. Barr\u00f3n-Cede\u00f1o, P. Gupta, P. Rosso, and B. Stein. Overview of the 4th International Competition on Plagiarism Detection. In P. Forner, J. Karlgren, and C. Womser-Hacker, editors, Working Notes Papers of the CLEF 2012 Evaluation Labs, Sept. 2012."},{"key":"e_1_3_2_1_41_1","volume-title":"5th International Competition on Plagiarism Detection. In P. Forner, R. Navigli, and D. Tufis, editors, Working Notes Papers of the CLEF 2013 Evaluation Labs","author":"Potthast M.","year":"2013","unstructured":"M. Potthast, T. Gollub, M. Hagen, M. Tippmann, J. Kiesel, P. Rosso, E. Stamatatos, and B. Stein. Overview of the 5th International Competition on Plagiarism Detection. In P. Forner, R. Navigli, and D. Tufis, editors, Working Notes Papers of the CLEF 2013 Evaluation Labs, Sept. 2013."},{"key":"e_1_3_2_1_42_1","volume-title":"Working Notes Papers of the CLEF 2014 Evaluation Labs, CEUR Workshop Proceedings. CEUR-WS.org","author":"Potthast M.","year":"2014","unstructured":"M. Potthast, M. Hagen, A. Beyer, M. Busse, M. Tippmann, P. Rosso, and B. Stein. Overview of the 6th International Competition on Plagiarism Detection. In L. Cappellato, N. Ferro, M. Halvey, and W. Kraaij, editors, Working Notes Papers of the CLEF 2014 Evaluation Labs, CEUR Workshop Proceedings. CEUR-WS.org, Sept. 2014."},{"key":"e_1_3_2_1_43_1","first-page":"997","volume-title":"COLING","author":"Potthast M.","year":"2010","unstructured":"M. Potthast, B. Stein, A. Barr\u00f3n-Cede\u00f1o, and P. Rosso. An evaluation framework for plagiarism detection. In COLING, pages 997--1005. Chinese Information Processing Society of China, 2010."},{"key":"e_1_3_2_1_44_1","first-page":"1","volume-title":"Overview of the 1st International Competition on Plagiarism Detection","author":"Potthast M.","year":"2009","unstructured":"M. Potthast, B. Stein, A. Eiselt, A. Barr\u00f3n-Cede\u00f1o, and P. Rosso. Overview of the 1st International Competition on Plagiarism Detection. In B. Stein, P. Rosso, E. Stamatatos, M. Koppel, and E. Agirre, editors, SEPLN 2009 Workshop on Uncovering Plagiarism, Authorship, and Social Software Misuse (PAN 2009), pages 1--9. CEUR-WS.org, Sept. 2009."},{"key":"e_1_3_2_1_45_1","volume-title":"Working Notes Papers of the CLEF 2014 Evaluation Labs. CEUR-WS.org","author":"Sanchez-Perez M.","year":"2014","unstructured":"M. Sanchez-Perez, G. Sidorov, and A. Gelbukh. A Winning Approach to Text Alignment for Text Reuse Detection at PAN 2014-Notebook for PAN at CLEF 2014. In L. Cappellato, N. Ferro, M. Halvey, and W. Kraaij, editors, Working Notes Papers of the CLEF 2014 Evaluation Labs. CEUR-WS.org, Sept. 2014."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24027-5_42"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/872757.872770"},{"key":"e_1_3_2_1_48_1","first-page":"571","volume-title":"Proceedings of the 31st Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR 2008","author":"Seo J.","year":"2008","unstructured":"J. Seo and W. B. Croft. Local text reuse detection. In S. Myaeng, D. W. Oard, F. Sebastiani, T. Chua, and M. Leong, editors, Proceedings of the 31st Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR 2008, Singapore, July 20--24, 2008, pages 571--578. ACM, 2008."},{"key":"e_1_3_2_1_49_1","first-page":"371","volume-title":"Symposium on Theory of Computing Conference, STOC'13","author":"Thorup M.","year":"2013","unstructured":"M. Thorup. Bottom-k and priority sampling, set similarity and subset sums with minimal independence. In D. Boneh, T. Roughgarden, and J. Feigenbaum, editors, Symposium on Theory of Computing Conference, STOC'13, Palo Alto, CA, USA, June 1--4, 2013, pages 371--380. ACM, 2013."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/1807167.1807222"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2213836.2213847"},{"key":"e_1_3_2_1_52_1","first-page":"1991","volume-title":"SIGMOD","author":"Wang P.","year":"2016","unstructured":"P. Wang, C. Xiao, J. Qin, W. Wang, X. Zhang, and Y. Ishikawa. Local similarity search for unstructured text. In F. \u00d6zcan, G. Koutrika, and S. Madden, editors, SIGMOD, pages 1991--2005. ACM, 2016."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.14778\/3099622.3099624"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2009.111"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/1367497.1367516"}],"event":{"name":"SIGMOD\/PODS '22: International Conference on Management of Data","location":"Philadelphia PA USA","acronym":"SIGMOD\/PODS '22","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2022 International Conference on Management of Data"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3514221.3526178","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3514221.3526178","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T18:10:13Z","timestamp":1750183813000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3514221.3526178"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,10]]},"references-count":55,"alternative-id":["10.1145\/3514221.3526178","10.1145\/3514221"],"URL":"https:\/\/doi.org\/10.1145\/3514221.3526178","relation":{},"subject":[],"published":{"date-parts":[[2022,6,10]]},"assertion":[{"value":"2022-06-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}