{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,24]],"date-time":"2025-08-24T01:57:06Z","timestamp":1756000626481,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,15]],"date-time":"2023-12-15T00:00:00Z","timestamp":1702598400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["IIS-18-16889, IIS-20-41415, IIS-21-14451"],"award-info":[{"award-number":["IIS-18-16889, IIS-20-41415, IIS-21-14451"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,15]]},"DOI":"10.1145\/3639233.3639356","type":"proceedings-article","created":{"date-parts":[[2024,3,5]],"date-time":"2024-03-05T11:02:10Z","timestamp":1709636530000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Cross-lingual Text Clustering in a Large System"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9528-6077","authenticated-orcid":false,"given":"Nicole R.","family":"Schneider","sequence":"first","affiliation":[{"name":"Computer Science, University of Maryland, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0369-816X","authenticated-orcid":false,"given":"Jagan","family":"Sankaranarayanan","sequence":"additional","affiliation":[{"name":"Data Infrastructure, Google, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8230-0653","authenticated-orcid":false,"given":"Hanan","family":"Samet","sequence":"additional","affiliation":[{"name":"Computer Science, University of Maryland, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2024,3,5]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. Kapabah. https:\/\/www.caravan.kz\/news\/smertelnyjj-smog-zasukha-i-ugroza-goloda-o-kakikh-ehkologicheskikh-problemakh-kazakhstana-chashhe-vsego-pishut-v-zapadnykh-smi-753179\/. Accessed: 2022-12-01."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","unstructured":"C. Aggarwal and P. Yu. 2006. A Framework for Clustering Massive Text and Categorical Data Streams Vol.\u00a02006. https:\/\/doi.org\/10.1137\/1.9781611972764.44","DOI":"10.1137\/1.9781611972764.44"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","unstructured":"C. Aggarwal and C. Zhai. 2012. A survey of text clustering algorithms. Mining Text Data (2012) 77\u2013128. https:\/\/doi.org\/10.1007\/978-1-4614-3223-4_4","DOI":"10.1007\/978-1-4614-3223-4_4"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/B978-012722442-8\/50016-1"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"C.\u00a0C. Aggarwal J. Han J. Wang and P.\u00a0S. Yu. 2004. A Framework for Projected Clustering of High Dimensional Data Streams. In VLDB.","DOI":"10.1016\/B978-012088469-8\/50075-9"},{"volume-title":"Proceedings of the Workshop on Search in Social Media (SSM 2009), co-located with ACM SIGIR 2009 Conference on Information Retrieval","author":"Agrawal M.","key":"e_1_3_2_1_6_1","unstructured":"M. Agrawal, M. Karimzadehgan, and C. Zhai. 2009. An online news recommender system for social networks. In Proceedings of the Workshop on Search in Social Media (SSM 2009), co-located with ACM SIGIR 2009 Conference on Information Retrieval, Boston. Citeseer."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/502585.502635"},{"key":"e_1_3_2_1_8_1","unstructured":"W. Ammar G. Mulcaire G. Lample C. Dyer and N.\u00a0A. Smith. 2018. C L ] 2 1 M ay 2 01 6 Massively Multilingual Word Embeddings."},{"key":"e_1_3_2_1_9_1","unstructured":"R. Baeza-Yates and B. Ribeiro-Neto. 2011. Modern Information Retrieval the Concepts and Technology Behind Search."},{"volume-title":"[n. d.]. LingPipe","author":"Baldwin B.","key":"e_1_3_2_1_10_1","unstructured":"B. Baldwin and B. Carpenter. [n. d.]. LingPipe. http:\/\/alias-i.com\/lingpipe\/. Accessed: 2022-07-13."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","unstructured":"S. Banerjee K. Ramanathan and A. Gupta. 2007. Clustering Short Texts Using Wikipedia(SIGIR \u201907). Association for Computing Machinery New York NY USA 787\u2013788. https:\/\/doi.org\/10.1145\/1277741.1277909","DOI":"10.1145\/1277741.1277909"},{"volume-title":"Proceedings of the 2012 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Association for Computational Linguistics","author":"Bansal M.","key":"e_1_3_2_1_12_1","unstructured":"M. Bansal, J. DeNero, and D. Lin. 2012. Unsupervised Translation Sense Clustering. In Proceedings of the 2012 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Association for Computational Linguistics, Montr\u00e9al, Canada, 773\u2013782. https:\/\/aclanthology.org\/N12-1095"},{"key":"e_1_3_2_1_13_1","unstructured":"P.\u00a0S. Bradley U.\u00a0M. Fayyad and C. Reina. 1998. Scaling Clustering Algorithms to Large Databases. In KDD."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/1571941.1571967"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130348.3130362"},{"key":"e_1_3_2_1_16_1","unstructured":"J. Ding L. Gravano and N. Shivakumar. 2000. Computing geographical scopes of web resources. (2000)."},{"volume-title":"Pattern Classification and Scene Analysis","author":"Duda O.","key":"e_1_3_2_1_17_1","unstructured":"R.\u00a0O. Duda and P.\u00a0E. Hart. 1973. Pattern Classification and Scene Analysis. Wiley Interscience, New York."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/360402.360419"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"F. Feng Y. Yang D. Cer N. Arivazhagan and W. Wang. 2022. Language-agnostic BERT Sentence Embedding. arxiv:2007.01852\u00a0[cs.CL]","DOI":"10.18653\/v1\/2022.acl-long.62"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.58680\/ce196524075"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10707-012-0173-8"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 2012 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Association for Computational Linguistics","author":"Green S.","year":"2012","unstructured":"S. Green, N. Andrews, M.\u00a0R. Gormley, M. Dredze, and C.\u00a0D. Manning. 2012. Entity Clustering Across Languages. In Proceedings of the 2012 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Association for Computational Linguistics, Montr\u00e9al, Canada, 60\u201369. https:\/\/aclanthology.org\/N12-1007"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2003.1198387"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","unstructured":"Q. He K. Chang E. Lim and J. Zhang. [n. d.]. Bursty Feature Representation for Clustering Text Streams. 491\u2013496. https:\/\/doi.org\/10.1137\/1.9781611972771.50 arXiv:https:\/\/epubs.siam.org\/doi\/pdf\/10.1137\/1.9781611972771.50","DOI":"10.1137\/1.9781611972771.50"},{"volume-title":"Proceedings of the First ACM SIGSPATIAL International Workshop on Mobile Geographic Information Systems. 25\u201332","author":"Ho S.","key":"e_1_3_2_1_25_1","unstructured":"S. Ho, M. Lieberman, P. Wang, and H. Samet. 2012. Mining future spatiotemporal events and their sentiment from online news articles for location-aware recommendation system. In Proceedings of the First ACM SIGSPATIAL International Workshop on Mobile Geographic Information Systems. 25\u201332."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-017-4838-z"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/1390334.1390367"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/502512.502529"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2903220.2903229"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"D. Kifer S. Ben-David and J. Gehrke. 2004. Detecting Change in Data Streams. In VLDB.","DOI":"10.1016\/B978-012088469-8.50019-X"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/1835449.1835633"},{"key":"e_1_3_2_1_32_1","unstructured":"G. Leban B. Fortuna J. Brank and M. Grobelnik. 2014. Cross-lingual detection of world events from news articles.. In ISWC (Posters & Demos). 21\u201324."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2567948.2577024"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1023\/a:1023250105036"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","unstructured":"K. Lee K. Kageura and K. Choi. 2002. Implicit Ambiguity Resolution Using Incremental Clustering in Korean-to-English Cross-Language Information Retrieval.https:\/\/doi.org\/10.3115\/1072228.1072314","DOI":"10.3115\/1072228.1072314"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2047296.2047298"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2009916.2009937"},{"volume-title":"Proceedings of SIGIR\u201911","author":"Lieberman M.D.","key":"e_1_3_2_1_38_1","unstructured":"M.D. Lieberman and H. Samet. 2011. Multifaceted toponym recognition for streaming news. In Proceedings of SIGIR\u201911. Beijing, China, 843\u2013852."},{"volume-title":"Proceedings of SIGIR\u201912","author":"Lieberman D.","key":"e_1_3_2_1_39_1","unstructured":"M.\u00a0D. Lieberman and H. Samet. 2012. Adaptive context features for toponym resolution in streaming news. In Proceedings of SIGIR\u201912. Portland, OR, 731\u2013740."},{"volume-title":"Proceedings of 6th Workshop on Geographic Information Retrieval","author":"Lieberman D.","key":"e_1_3_2_1_40_1","unstructured":"M.\u00a0D. Lieberman, H. Samet, and J. Sankaranarayanan. 2010. Geotagging: Using proximity, sibling, and prominence clues to understand comma groups. In Proceedings of 6th Workshop on Geographic Information Retrieval. Zurich, Switzerland."},{"volume-title":"Proceedings of the 26th IEEE International Conference on Data Engineering","author":"Lieberman D.","key":"e_1_3_2_1_41_1","unstructured":"M.\u00a0D. Lieberman, H. Samet, and J. Sankaranarayanan. 2010. Geotagging with local lexicons to build indexes for textually-specified spatial data. In Proceedings of the 26th IEEE International Conference on Data Engineering. Long Beach, CA, 201\u2013212."},{"volume-title":"Proceedings of the 15th ACM International Symposium on Advances in Geographic Information Systems, H.\u00a0Samet, M.\u00a0Schneider, and C.\u00a0Shahabi (Eds.)","author":"Lieberman D.","key":"e_1_3_2_1_42_1","unstructured":"M.\u00a0D. Lieberman, H. Samet, J. Sankaranarayanan, and J. Sperling. 2007. STEWARD: architecture of a spatio-textual search engine. In Proceedings of the 15th ACM International Symposium on Advances in Geographic Information Systems, H.\u00a0Samet, M.\u00a0Schneider, and C.\u00a0Shahabi (Eds.). Seattle, WA, 186\u2013193."},{"volume-title":"Proceedings of LocWeb\u201914","author":"Liu F.","key":"e_1_3_2_1_43_1","unstructured":"F. Liu, M. Vasardani, and T. Baldwin. 2014. Automatic identification of locative expressions from social media text: A comparative analysis. In Proceedings of LocWeb\u201914. Shanghai, China, 9\u201316."},{"volume-title":"Machine Translation and the Information Soup","author":"Oard W.","key":"e_1_3_2_1_44_1","unstructured":"D.\u00a0W. Oard. 1998. A Comparative Study of Query and Document Translation for Cross-Language Information Retrieval. In Machine Translation and the Information Soup, David Farwell, Laurie Gerber, and Eduard Hovy (Eds.). Springer Berlin Heidelberg, Berlin, Heidelberg, 472\u2013483."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2002.994785"},{"volume-title":"Annual Meeting of the Association for Computational Linguistics.","author":"Pires P.","key":"e_1_3_2_1_46_1","unstructured":"T.\u00a0J.\u00a0P. Pires, E. Schlinger, and D. Garrette. 2019. How Multilingual is Multilingual BERT?. In Annual Meeting of the Association for Computational Linguistics."},{"volume-title":"COLING 2004: Proceedings of the 20th International Conference on Computational Linguistics. COLING","author":"Pouliquen B.","key":"e_1_3_2_1_47_1","unstructured":"B. Pouliquen, R. Steinberger, C. Ignat, E. K\u00e4sper, and I. Temnikova. 2004. Multilingual and cross-lingual news topic tracking. In COLING 2004: Proceedings of the 20th International Conference on Computational Linguistics. COLING, Geneva, Switzerland, 959\u2013965. https:\/\/aclanthology.org\/C04-1138"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/1869790.1869800"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","unstructured":"M. Quezada and B. Poblete. 2019. A Lightweight Representation of News Events on Social Media(SIGIR\u201919). Association for Computing Machinery New York NY USA 1049\u20131052. https:\/\/doi.org\/10.1145\/3331184.3331300","DOI":"10.1145\/3331184.3331300"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.3115\/1119394.1119402"},{"volume-title":"Conference on Empirical Methods in Natural Language Processing.","author":"Reimers N.","key":"e_1_3_2_1_51_1","unstructured":"N. Reimers and I. Gurevych. 2019. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. In Conference on Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the ACM SIGIR: SWSM 63","author":"Rosa K.","year":"2011","unstructured":"K. Rosa, R. Shah, B. Lin, A. Gershman, and R. Frederking. 2011. Topical clustering of tweets. Proceedings of the ACM SIGIR: SWSM 63 (2011)."},{"volume-title":"Proceedings of the 26th International Joint Conference on Artificial Intelligence","author":"Rupnik J.","key":"e_1_3_2_1_53_1","unstructured":"J. Rupnik, A. Muhi\u010d, G. Leban, B. Fortuna, and M. Grobelnik. 2017. News across Languages: Cross-Lingual Document Similarity and Event Tracking. In Proceedings of the 26th International Joint Conference on Artificial Intelligence (Melbourne, Australia) (IJCAI\u201917). AAAI Press, 5050\u20135054."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1016\/0306-4573(88)90021-0"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/361219.361220"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2675354.2675698"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2093973.2094065"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/1963192.1963303"},{"key":"e_1_3_2_1_59_1","volume-title":"Maps: The Power of Searching with Spatial Synonyms. Technical Report. Computer Science Department","author":"Samet H.","year":"2009","unstructured":"H. Samet, B.\u00a0E. Teitler, M.\u00a0D. Lieberman, J. Sankaranarayanan, D. Panozzo, and J. Sperling. 2009. Reading News with Maps: The Power of Searching with Spatial Synonyms. Technical Report. Computer Science Department, University of Maryland, College Park, MD. submitted for publication."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"crossref","unstructured":"J. Sankaranarayanan H. Samet B. Teitler M.\u00a0D. Lieberman and J. Sperling. 2009. TwitterStand: News in tweets D.\u00a0Agrawal W.\u00a0G. Aref C.-T. Lu M.\u00a0F. Mokbel P.\u00a0Scheuermann C.\u00a0Shahabi and O.\u00a0Wolfson (Eds.). Seattle WA 42\u201351.","DOI":"10.1145\/1653771.1653781"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3486183.3491066"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-2037"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3269206.3269262"},{"volume-title":"Proceedings of the 19th annual international ACM SIGIR conference on Research and development in information retrieval. 58\u201365","author":"Sheridan P.","key":"e_1_3_2_1_64_1","unstructured":"P. Sheridan and J. Ballerini. 1996. Experiments in multilingual information retrieval using the SPIDER system. In Proceedings of the 19th annual international ACM SIGIR conference on Research and development in information retrieval. 58\u201365."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.4304\/jait.2.3.152-158"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.12720\/jait.13.3.224-229"},{"key":"e_1_3_2_1_67_1","unstructured":"L. Stankevivcius and M. Lukovsevivcius. 2020. Testing pre-trained Transformer models for Lithuanian news clustering."},{"key":"e_1_3_2_1_68_1","volume-title":"Proceedings of the International KDD Workshop on Text Mining (06","author":"Steinbach M.","year":"2000","unstructured":"M. Steinbach, G. Karypis, and V. Kumar. 2000. A Comparison of Document Clustering Techniques. Proceedings of the International KDD Workshop on Text Mining (06 2000)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"crossref","unstructured":"B. Teitler M.\u00a0D. Lieberman D. Panozzo J. Sankaranarayanan H. Samet and J. Sperling. 2008. NewsStand: A new view on news W.\u00a0G. Aref M.\u00a0F. Mokbel H.\u00a0Samet M.\u00a0Schneider C.\u00a0Shahabi and O.\u00a0Wolfson (Eds.). Irvine CA 144\u2013153.","DOI":"10.1145\/1463434.1463458"},{"key":"e_1_3_2_1_70_1","unstructured":"B.\u00a0E. Teitler J. Sankaranarayanan and H. Samet. 2010. Online document clustering using the GPU. Technical Report TR\u20134970. Computer Science Department University of Maryland College Park MD."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"K. Wu and B. Lu. 2007. Cross-Lingual Document Clustering. In Advances in Knowledge Discovery and Data Mining Zhi-Hua Zhou Hang Li and Qiang Yang (Eds.). Springer Berlin Heidelberg Berlin Heidelberg 956\u2013963.","DOI":"10.1007\/978-3-540-71701-0_107"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-021-02263-z"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2005.1556436"},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.008"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","unstructured":"D. Zhou M. Truran T. Brailsford V. Wade and H. Ashman. 2012. Translation Techniques in Cross-Language Information Retrieval. ACM Comput. Surv. 45 1 Article 1 (dec 2012) 44\u00a0pages. https:\/\/doi.org\/10.1145\/2379776.2379777","DOI":"10.1145\/2379776.2379777"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073163"}],"event":{"name":"NLPIR 2023: 2023 7th International Conference on Natural Language Processing and Information Retrieval","acronym":"NLPIR 2023","location":"Seoul Republic of Korea"},"container-title":["Proceedings of the 2023 7th International Conference on Natural Language Processing and Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3639233.3639356","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3639233.3639356","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:56:37Z","timestamp":1755892597000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3639233.3639356"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,15]]},"references-count":76,"alternative-id":["10.1145\/3639233.3639356","10.1145\/3639233"],"URL":"https:\/\/doi.org\/10.1145\/3639233.3639356","relation":{},"subject":[],"published":{"date-parts":[[2023,12,15]]},"assertion":[{"value":"2024-03-05","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}