{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,27]],"date-time":"2025-12-27T07:24:03Z","timestamp":1766820243744,"version":"3.40.5"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,8,3]],"date-time":"2024-08-03T00:00:00Z","timestamp":1722643200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,3]],"date-time":"2024-08-03T00:00:00Z","timestamp":1722643200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Lang Resources &amp; Evaluation"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s10579-024-09759-3","type":"journal-article","created":{"date-parts":[[2024,8,3]],"date-time":"2024-08-03T08:01:53Z","timestamp":1722672113000},"page":"1179-1199","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["PESTS: Persian_English cross lingual corpus for semantic textual similarity"],"prefix":"10.1007","volume":"59","author":[{"given":"Mohammad","family":"Abdous","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Poorya","family":"Piroozfar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Behrouz","family":"MinaeiBidgoli","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,3]]},"reference":[{"key":"9759_CR400","doi-asserted-by":"publisher","unstructured":"Abolfazl, AleAhmad Hadi, Amiri Ehsan, Darrudi Masoud, Rahgozar Farhad, Oroumchian (2009) Hamshahri: A standard Persian text collection Knowledge-Based Systems 22(5) 382\u2013387 https:\/\/doi.org\/10.1016\/j.knosys.2009.05.002","DOI":"10.1016\/j.knosys.2009.05.002"},{"key":"9759_CR1","unstructured":"Agirre, E., Cer, D., Diab, M., & Gonzalez-Agirre, A. (2012). Semeval-2012 task 6: A pilot on semantic textual similarity. In SEM 2012: The First Joint Conference on Lexical and Computational Semantics\u2013Volume 1: Proceedings of the main conference and the shared task, and Volume 2: Proceedings of the Sixth International Workshop on Semantic Evaluation (SemEval 2012) (pp. 385\u2013393)."},{"key":"9759_CR2","unstructured":"Agirre, E., Cer, D., Diab, M., Gonzalez-Agirre, A., & Guo, W. (2013). SEM 2013 shared task: Semantic textual similarity. In Second joint conference on lexical and computational semantics (SEM), volume 1: Proceedings of the Main conference and the shared task: Semantic textual similarity (pp. 32\u201343)."},{"key":"9759_CR3","doi-asserted-by":"crossref","unstructured":"Agirre, E., Banea, C., Cardie, C., Cer, D. M., Diab, M. T., Gonzalez-Agirre, A., Guo, W., Mihalcea, R., Rigau, G., & Wiebe, J. (2014, Aug). SemEval-2014 task 10: Multilingual semantic textual similarity. In: SemEval@ COLING (pp. 81\u201391).","DOI":"10.3115\/v1\/S14-2010"},{"key":"9759_CR4","doi-asserted-by":"crossref","unstructured":"Agirre, E., Banea, C., Cardie, C., Cer, D., Diab, M., Gonzalez-Agirre, A., Guo, W., Lopez-Gazpio, I., Maritxalar, M., Mihalcea, R., & Rigau, G. (2015, June). Semeval-2015 task 2: Semantic textual similarity, English, Spanish and pilot on interpretability. In Proceedings of the 9th international workshop on semantic evaluation (SemEval 2015) (pp. 252\u2013263).","DOI":"10.18653\/v1\/S15-2045"},{"key":"9759_CR5","doi-asserted-by":"crossref","unstructured":"Agirre, E., Banea, C., Cer, D., Diab, M., Gonzalez Agirre, A., Mihalcea, R., Rigau Claramunt, G., & Wiebe, J. (2016). Semeval-2016 task 1: Semantic textual similarity, monolingual and cross-lingual evaluation. In: SemEval-2016: 10th international workshop on semantic evaluation, 2016 Jun 16\u201317 (pp. 497\u2013511). Association for Computational Linguistics.","DOI":"10.18653\/v1\/S16-1081"},{"issue":"2","key":"9759_CR6","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1016\/j.jksuci.2016.04.001","volume":"29","author":"FS Al-Anzi","year":"2017","unstructured":"Al-Anzi, F. S., & AbuZeina, D. (2017). Toward an enhanced Arabic text classification using cosine similarity and Latent Semantic Indexing. Journal of King Saud University\u2014Computer and Information Sciences, 29(2), 189\u2013195.","journal-title":"Journal of King Saud University\u2014Computer and Information Sciences"},{"issue":"4","key":"9759_CR7","doi-asserted-by":"publisher","first-page":"7764","DOI":"10.1016\/j.eswa.2008.11.022","volume":"36","author":"RM Aliguliyev","year":"2009","unstructured":"Aliguliyev, R. M. (2009). A new sentence similarity measure and sentence based extractive technique for automatic text summarization. Expert Systems with Applications, 36(4), 7764\u20137772.","journal-title":"Expert Systems with Applications"},{"issue":"2","key":"9759_CR8","doi-asserted-by":"publisher","first-page":"133","DOI":"10.1109\/TSMCC.2011.2134847","volume":"42","author":"SM Alzahrani","year":"2011","unstructured":"Alzahrani, S. M., Salim, N., & Abraham, A. (2011). Understanding plagiarism linguistic patterns, textual features, and detection methods. IEEE Transactions on Systems, Man, and Cybernetics, Part C (applications and Review), 42(2), 133\u2013149.","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics, Part C (applications and Review)"},{"key":"9759_CR9","unstructured":"Ammar, W., Mulcaire, G., Tsvetkov, Y., Lample, G., Dyer, C., & Smith, N. A. (2016). Massively multilingual word embeddings. arXiv preprint arXiv:1602.01925."},{"key":"9759_CR10","unstructured":"Barbieri, F., Anke, L. E., & Camacho-Collados, J. (2021). Xlm-t: Multilingual language models in twitter for sentiment analysis and beyond. arXiv preprint arXiv:2104.12250."},{"key":"9759_CR11","first-page":"1","volume-title":"Noise reduction in speech processing","author":"J Benesty","year":"2009","unstructured":"Benesty, J., Chen, J., Huang, Y., & Cohen, I. (2009). Pearson correlation coefficient. Noise reduction in speech processing (pp. 1\u20134). Springer."},{"key":"9759_CR12","unstructured":"Bjerva, J., & \u00d6stling, R. (2017). Cross-lingual learning of semantic textual similarity with multilingual word representations. In: 21st Nordic Conference on Computational Linguistics, NoDaLiDa, Gothenburg, Sweden, 22\u201324 May, 2017 (pp. 211\u2013215). Link\u00f6ping University Electronic Press."},{"key":"9759_CR13","unstructured":"De Boni, M., & Manandhar, S. (2003). The use of sentence similarity as a semantic relevance metric for question answering. In: New directions in question answering (pp. 138\u2013144)."},{"key":"9759_CR15","doi-asserted-by":"crossref","unstructured":"Cer, D., Diab, M., Agirre, E., Lopez-Gazpio, I., & Specia, L. (2017). Semeval-2017 task 1: Semantic textual similarity-multilingual and cross-lingual focused evaluation. arXiv preprint arXiv:1708.00055.","DOI":"10.18653\/v1\/S17-2001"},{"key":"9759_CR16","doi-asserted-by":"crossref","unstructured":"Chidambaram, M., Yang, Y., Cer, D., Yuan, S., Sung, Y. H., Strope, B., & Kurzweil, R. (2018). Learning cross-lingual sentence representations via a multi-task dual-encoder model. arXiv preprint arXiv:1810.12836.","DOI":"10.18653\/v1\/W19-4330"},{"key":"9759_CR17","doi-asserted-by":"crossref","unstructured":"Conneau, A., Lample, G., Rinott, R., Williams, A., Bowman, S. R., Schwenk, H., & Stoyanov, V. (2018). XNLI: Evaluating cross-lingual sentence representations. arXiv preprint arXiv:1809.05053.","DOI":"10.18653\/v1\/D18-1269"},{"key":"9759_CR18","unstructured":"Conneau, A., & Lample, G. (2019). Cross-lingual language model pretraining. Advances in Neural Information Processing Systems, 32."},{"key":"9759_CR19","doi-asserted-by":"crossref","unstructured":"Conneau, A., Khandelwal, K., Goyal, N., Chaudhary, V., Wenzek, G., Guzm\u00e1n, F., Grave, E., Ott, M., Zettlemoyer, L., & Stoyanov, V. (2019). Unsupervised cross-lingual representation learning at scale. arXiv preprint arXiv:1911.02116.","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"9759_CR20","unstructured":"Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional Transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"9759_CR21","doi-asserted-by":"crossref","unstructured":"Eghbalzadeh, H., Hosseini, B., Khadivi, S., & Khodabakhsh, A. (2012, Nov). Persica: A Persian corpus for multi-purpose text mining and Natural language processing. In 6th international symposium on telecommunications (IST) (pp. 1207\u20131214). IEEE.","DOI":"10.1109\/ISTEL.2012.6483172"},{"key":"9759_CR22","unstructured":"Ferrero, J., Agnes, F., Besacier, L., & Schwab, D. (2016, May). A multilingual, multi-style and multi-granularity dataset for cross-language textual similarity detection. In: 10th edition of the language resources and evaluation conference."},{"key":"9759_CR23","unstructured":"Gouws, S., Bengio, Y., & Corrado, G. (2015, June). Bilbowa: Fast bilingual distributed representations without word alignments. In: International conference on machine learning (pp. 748\u2013756). PMLR."},{"key":"9759_CR24","doi-asserted-by":"crossref","unstructured":"Hercig, T., & Kr\u00e1l, P. (2021, Sept). Evaluation datasets for cross-lingual semantic textual similarity. In Proceedings of the international conference on recent advances in natural language processing (RANLP 2021) (pp. 524\u2013529).","DOI":"10.26615\/978-954-452-072-4_059"},{"key":"9759_CR25","doi-asserted-by":"crossref","unstructured":"Jawahar, G., Sagot, B., & Seddah, D. (2019, July). What does BERT learn about the structure of language?. In: ACL 2019\u201357th annual meeting of the association for computational linguistics.","DOI":"10.18653\/v1\/P19-1356"},{"key":"9759_CR26","unstructured":"Kashefi, O., 2018. MIZAN: a large persian-english parallel corpus. arXiv preprint arXiv:1801.02107."},{"key":"9759_CR27","unstructured":"Klementiev, A., Titov, I., & Bhattarai, B. (2012, Dec). Inducing crosslingual distributed representations of words. In Proceedings of COLING 2012 (pp. 1459\u20131474)."},{"issue":"4","key":"9759_CR30","doi-asserted-by":"publisher","first-page":"647","DOI":"10.13053\/cys-20-4-2506","volume":"20","author":"G Majumder","year":"2016","unstructured":"Majumder, G., Pakray, P., Gelbukh, A., & Pinto, D. (2016). Semantic textual similarity methods, tools, and applications: A survey. Computaci\u00f3n y Sistemas, 20(4), 647\u2013665.","journal-title":"Computaci\u00f3n y Sistemas"},{"issue":"01","key":"9759_CR31","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1142\/S0219649204000729","volume":"3","author":"D Manjula","year":"2004","unstructured":"Manjula, D., & Geetha, T. V. (2004). Semantic search engine. Journal of Information & Knowledge Management, 3(01), 107\u2013117.","journal-title":"Journal of Information & Knowledge Management"},{"key":"9759_CR32","unstructured":"Marelli, M., Menini, S., Baroni, M., Bentivogli, L., Bernardi, R., & Zamparelli, R. (2014, May). A SICK cure for the evaluation of compositional distributional semantic models. In: Proceedings of the ninth international conference on language resources and evaluation (LREC\u201914) (pp. 216\u2013223)."},{"key":"9759_CR34","unstructured":"Mikolov, T., Le, Q. V., & Sutskever, I. (2013). Exploiting similarities among languages for machine translation."},{"key":"9759_CR35","unstructured":"Mirzaei, A., & Safari, P. (2018, May). Persian discourse treebank and coreference corpus. In: Proceedings of the eleventh international conference on language resources and evaluation (lrec 2018)."},{"key":"9759_CR36","doi-asserted-by":"crossref","unstructured":"Pilevar, M. T., Faili, H., & Pilevar, A. H. (2011, Feb.) Tep: Tehran English\u2013Persian parallel corpus. In: International conference on intelligent text processing and computational linguistics (pp. 68\u201379). Springer.","DOI":"10.1007\/978-3-642-19437-5_6"},{"issue":"1","key":"9759_CR37","doi-asserted-by":"publisher","first-page":"257","DOI":"10.1007\/s10579-022-09620-5","volume":"57","author":"Z Rahimi","year":"2023","unstructured":"Rahimi, Z., & Homayounpour, M. M. (2023). The impact of preprocessing on word embedding quality: A comparative study. Language Resources and Evaluation, 57(1), 257\u2013291.","journal-title":"Language Resources and Evaluation"},{"key":"9759_CR38","doi-asserted-by":"crossref","unstructured":"Reimers, N., & Gurevych, I. (2019, Nov.) Sentence-BERT: Sentence embeddings using Siamese BERT-networks. In: Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP) (pp. 3982\u20133992).","DOI":"10.18653\/v1\/D19-1410"},{"key":"9759_CR39","doi-asserted-by":"crossref","unstructured":"Reimers, N., & Gurevych, I. (2020, Nov). Making monolingual sentence embeddings multilingual using knowledge distillation. In: Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP) (pp. 4512\u20134525).","DOI":"10.18653\/v1\/2020.emnlp-main.365"},{"key":"9759_CR40","unstructured":"Sanh, V., Debut, L., Chaumond, J., & Wolf, T., 2019. DistilBERT, a distilled version of BERT: Smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108."},{"key":"9759_CR41","unstructured":"Shibata, Y., Kida, T., Fukamachi, S., Takeda, M., Shinohara, A., Shinohara, T., & Arikawa, S. (1999). Byte Pair encoding: A text compression scheme that accelerates pattern matching."},{"issue":"3","key":"9759_CR42","first-page":"271","volume":"3","author":"C Spearman","year":"1910","unstructured":"Spearman, C. (1910). Correlation calculated from faulty data. British Journal of Psychology, 3(3), 271.","journal-title":"British Journal of Psychology"},{"key":"9759_CR43","unstructured":"Tang, X., Cheng, S., Do, L., Min, Z., Ji, F., Yu, H., Zhang, J., & Chen, H. (2018). Improving multilingual semantic textual similarity with shared sentence encoder for low-resource languages. arXiv preprint arXiv:1810.08740."},{"key":"9759_CR44","doi-asserted-by":"crossref","unstructured":"\u017di\u017eka, J., & Da\u0159ena, F. (2010, Sept). Automatic sentiment analysis using the textual pattern content similarity in natural language. In: International conference on text, speech and dialogue (pp. 224\u2013231). Springer.","DOI":"10.1007\/978-3-642-15760-8_29"},{"key":"9759_CR45","doi-asserted-by":"crossref","unstructured":"Zou, W. Y., Socher, R., Cer, D., & Manning, C. D. (2013, Oct). Bilingual word embeddings for phrase-based machine translation. In: Proceedings of the 2013 conference on empirical methods in natural language processing (pp. 1393\u20131398).","DOI":"10.18653\/v1\/D13-1141"}],"container-title":["Language Resources and Evaluation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10579-024-09759-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10579-024-09759-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10579-024-09759-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,18]],"date-time":"2025-05-18T15:03:46Z","timestamp":1747580626000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10579-024-09759-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,3]]},"references-count":42,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["9759"],"URL":"https:\/\/doi.org\/10.1007\/s10579-024-09759-3","relation":{},"ISSN":["1574-020X","1574-0218"],"issn-type":[{"type":"print","value":"1574-020X"},{"type":"electronic","value":"1574-0218"}],"subject":[],"published":{"date-parts":[[2024,8,3]]},"assertion":[{"value":"20 June 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 August 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no competing interests to declare.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}