{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T00:34:38Z","timestamp":1758587678748,"version":"3.44.0"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032054081","type":"print"},{"value":"9783032054098","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T00:00:00Z","timestamp":1757894400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T00:00:00Z","timestamp":1757894400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-05409-8_4","type":"book-chapter","created":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T17:03:08Z","timestamp":1758560588000},"page":"36-52","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Generating Synthetic Data with\u00a0Large Language Models for\u00a0Low-Resource Sentence Retrieval"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-3279-6480","authenticated-orcid":false,"given":"Davide","family":"Caffagni","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1396-9114","authenticated-orcid":false,"given":"Federico","family":"Cocchi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5538-5882","authenticated-orcid":false,"given":"Anna","family":"Mambelli","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7063-7782","authenticated-orcid":false,"given":"Fabio","family":"Tutrone","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1208-6743","authenticated-orcid":false,"given":"Marco","family":"Zanella","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9640-9385","authenticated-orcid":false,"given":"Marcella","family":"Cornia","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2239-283X","authenticated-orcid":false,"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,15]]},"reference":[{"key":"4_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al.: GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"4_CR2","unstructured":"Anil, R., Borgeaud, S., Wu, Y., Alayrac, J.B., Yu, J., Soricut, R., Schalkwyk, J., Dai, A.M., Hauth, A., et\u00a0al.: Gemini: A Family of Highly Capable Multimodal Models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"4_CR3","unstructured":"Bamman, D., Burns, P.J.: Latin BERT: A Contextual Language Model for Classical Philology. arXiv preprint arXiv:2009.10053 (2020)"},{"key":"4_CR4","doi-asserted-by":"crossref","unstructured":"Bonifacio, L., Abonizio, H., Fadaee, M., Nogueira, R.: InPars: Unsupervised Dataset Generation for Information Retrieval. In: Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval (2022)","DOI":"10.1145\/3477495.3531863"},{"key":"4_CR5","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et\u00a0al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems (2020)"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Caffagni, D., Barraco, M., Cornia, M., Baraldi, L., Cucchiara, R.: SynthCap: Augmenting Transformers with Synthetic Data for Image Captioning. In: Proceedings of the International Conference on Image Analysis and Processing (2023)","DOI":"10.1007\/978-3-031-43148-7_10"},{"key":"4_CR7","unstructured":"Caffagni, D., Cocchi, F., Mambelli, A., Tutrone, F., Zanella, M., Cornia, M., Cucchiara, R., et\u00a0al.: Benchmarking BERT-based Models for Latin: A Case Study on Biblical References in Ancient Christian Literature. In: Proceedings of the Conference on Information and Research Science Connecting to Digital and Library Science (2025)"},{"key":"4_CR8","doi-asserted-by":"crossref","unstructured":"Caffagni, D., Sarto, S., Cornia, M., Baraldi, L., Cucchiara, R.: Recurrence-Enhanced Vision-and-Language Transformers for Robust Multimodal Document Retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2025)","DOI":"10.1109\/CVPR52734.2025.00867"},{"key":"4_CR9","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A Simple Framework for Contrastive Learning of Visual Representations. In: Proceedings of the International Conference on Machine Learning (2020)"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Cocchi, F., Moratelli, N., Cornia, M., Baraldi, L., Cucchiara, R.: Augmenting Multimodal LLMs with Self-Reflective Tokens for Knowledge-based Visual Question Answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2025)","DOI":"10.1109\/CVPR52734.2025.00859"},{"key":"4_CR11","unstructured":"Conneau, A., Lample, G.: Cross-lingual Language Model Pretraining. In: Advances in Neural Information Processing Systems (2019)"},{"key":"4_CR12","unstructured":"Dai, Z., Zhao, V.Y., Ma, J., Luan, Y., Ni, J., Lu, J., Bakalov, A., Guu, K., Hall, K.B., Chang, M.W.: Promptagator: Few-shot Dense Retrieval From 8 Examples. arXiv preprint arXiv:2209.11755 (2022)"},{"key":"4_CR13","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In: Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics (2018)"},{"key":"4_CR14","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: VSE++: Improving Visual-Semantic Embeddings with Hard Negatives. In: Proceedings of the British Machine Vision Conference (2018)"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"Feng, S.Y., Gangal, V., Wei, J., Chandar, S., Vosoughi, S., Mitamura, T., Hovy, E.: A Survey of Data Augmentation Approaches for NLP. In: Findings of the Annual Meeting of the Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2021.findings-acl.84"},{"key":"4_CR16","unstructured":"Hu, J., Ruder, S., Siddhant, A., Neubig, G., Firat, O., Johnson, M.: XTREME: A Massively Multilingual Multi-task Benchmark for Evaluating Cross-lingual Generalisation. In: Proceedings of the International Conference on Machine Learning (2020)"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Hutchinson, B.: Modeling the Sacred: Considerations when Using Religious Texts in Natural Language Processing. In: Findings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics (2024)","DOI":"10.18653\/v1\/2024.findings-naacl.65"},{"issue":"3","key":"4_CR18","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","volume":"7","author":"J Johnson","year":"2019","unstructured":"Johnson, J., Douze, M., J\u00e9gou, H.: Billion-scale Similarity Search with GPUs. IEEE Transactions on Big Data 7(3), 535\u2013547 (2019)","journal-title":"IEEE Transactions on Big Data"},{"key":"4_CR19","unstructured":"Kalantidis, Y., Sariyildiz, M.B., Pion, N., Weinzaepfel, P., Larlus, D.: Hard Negative Mixing for Contrastive Learning. In: Advances in Neural Information Processing Systems (2020)"},{"key":"4_CR20","unstructured":"Kingma, D.P., Ba, J.L.: ADAM: a Method for Stochastic Optimization. In: Proceedings of the International Conference on Machine Learning (2015)"},{"key":"4_CR21","unstructured":"Klie, J.C., Bugert, M., Boullosa, B., De\u00a0Castilho, R.E., Gurevych, I.: The INCEpTION Platform: Machine-Assisted and Knowledge-Oriented Interactive Annotation. In: Proceedings of System Demonstrations of the International Conference on Computational Linguistics (2018)"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Kumar, V., Choudhary, A., Cho, E.: Data Augmentation using Pre-trained Transformer Models. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics Workshops (2020)","DOI":"10.18653\/v1\/2020.lifelongnlp-1.3"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Manjavacas, E., Fonteyn, L.: Adapting vs. Pre-Training Language Models for Historical Languages. Journal of Data Mining & Digital Humanities (Digital Humanities in Languages) (2022)","DOI":"10.46298\/jdmdh.9152"},{"key":"4_CR24","unstructured":"Nogueira, R., Yang, W., Lin, J., Cho, K.: Document Expansion by Query Prediction. arXiv preprint arXiv:1904.08375 (2019)"},{"key":"4_CR25","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation Learning with Contrastive Predictive Coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Piotrowski, M.: Natural Language Processing for Historical Texts, vol.\u00a017. Morgan & Claypool Publishers (2012)","DOI":"10.1007\/978-3-031-02146-6"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Pires, T., Schlinger, E., Garrette, D.: How Multilingual is Multilingual BERT. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (2019)","DOI":"10.18653\/v1\/P19-1493"},{"key":"4_CR28","doi-asserted-by":"crossref","unstructured":"Poppi, S., Poppi, T., Cocchi, F., Cornia, M., Baraldi, L., Cucchiara, R.: Safe-CLIP: Removing NSFW Concepts from Vision-and-Language Models. In: Proceedings of the European Conference on Computer Vision (2024)","DOI":"10.1007\/978-3-031-73668-1_20"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Ren, P., Liu, Z., Song, X., Tian, H., Chen, Z., Ren, Z., de\u00a0Rijke, M.: Wizard of Search Engine: Access to Information Through Conversations with Search Engines. In: Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval (2021)","DOI":"10.1145\/3404835.3462897"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Riemenschneider, F., Frank, A.: Exploring Large Language Models for Classical Philology. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (2023)","DOI":"10.18653\/v1\/2023.acl-long.846"},{"key":"4_CR31","unstructured":"Sabatier\u00a0(ed.), P.: Bibliorum Sacrorum latinae versiones antiquae seu Vetus Italica (3 vols.). Reims, Reginaldus Florentain (1743\u20131751)"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Salemi, A., Zamani, H.: Towards a Search Engine for Machines: Unified Ranking for Multiple Retrieval-Augmented Large Language Models. In: Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval (2024)","DOI":"10.1145\/3626772.3657733"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"Sarto, S., Barraco, M., Cornia, M., Baraldi, L., Cucchiara, R.: Positive-Augmented Contrastive Learning for Image and Video Captioning Evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00668"},{"key":"4_CR34","doi-asserted-by":"crossref","unstructured":"Sarto, S., Moratelli, N., Cornia, M., Baraldi, L., Cucchiara, R.: Positive-Augmented Contrastive Learning for Vision-and-Language Evaluation and Training. arXiv preprint arXiv:2410.07336 (2024)","DOI":"10.1007\/s11263-025-02535-y"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Singh, P., Rutten, G., Lefever, E.: A Pilot Study for BERT Language Modelling and Morphological Analysis for Ancient and Medieval Greek. In: Proceedings of the Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (2021)","DOI":"10.18653\/v1\/2021.latechclfl-1.15"},{"issue":"3","key":"4_CR36","doi-asserted-by":"publisher","first-page":"703","DOI":"10.1162\/coli_a_00481","volume":"49","author":"T Sommerschield","year":"2023","unstructured":"Sommerschield, T., Assael, Y., Pavlopoulos, J., Stefanak, V., Senior, A., Dyer, C., Bodel, J., Prag, J., Androutsopoulos, I., de Freitas, N.: Machine Learning for Ancient Languages: A Survey. Comput. Linguist. 49(3), 703\u2013747 (2023)","journal-title":"Comput. Linguist."},{"key":"4_CR37","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et\u00a0al.: LLaMA: Open and Efficient Foundation Language Models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"4_CR38","doi-asserted-by":"crossref","unstructured":"Wang, K., Thakur, N., Reimers, N., Gurevych, I.: GPL: Generative Pseudo Labeling for Unsupervised Domain Adaptation of Dense Retrieval. In: Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics (2022)","DOI":"10.18653\/v1\/2022.naacl-main.168"},{"key":"4_CR39","doi-asserted-by":"crossref","unstructured":"Wang, L., Yang, N., Huang, X., Yang, L., Majumder, R., Wei, F.: Improving Text Embeddings with Large Language Models. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (2024)","DOI":"10.18653\/v1\/2024.acl-long.642"},{"key":"4_CR40","doi-asserted-by":"crossref","unstructured":"Wang, L., Yang, N., Huang, X., Yang, L., Majumder, R., Wei, F.: Large Search Model: Redefining Search Stack in the Era of LLMs. In: ACM SIGIR Forum (2024)","DOI":"10.1145\/3642979.3643006"},{"key":"4_CR41","doi-asserted-by":"crossref","unstructured":"Wang, L., Yang, N., Wei, F.: Query2doc: Query Expansion with Large Language Models. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.585"},{"key":"4_CR42","unstructured":"Weber, R., Gryson\u00a0(eds.), R.: Biblia Sacra iuxta Vulgatam Versionem. Stuttgart, Deutsche Bibelgesellschaft (2007$$^5$$ (R Weber, 1969$$^1$$))"},{"key":"4_CR43","doi-asserted-by":"crossref","unstructured":"Zhan, J., Mao, J., Liu, Y., Guo, J., Zhang, M., Ma, S.: Optimizing dense retrieval model training with hard negatives. In: Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval (2021)","DOI":"10.1145\/3404835.3462880"},{"key":"4_CR44","unstructured":"Zycha\u00a0(ed.), J.: Sancti Aureli Augustini: De Genesi ad litteram libri duodecim eiusdem libri capitula. De Genesi ad litteram imperfectus liber. Locutionum in Heptateuchum libri septem. Pragae-Vindobonae-Lipsiae, Tempsky-Freyta (1894)"}],"container-title":["Lecture Notes in Computer Science","Linking Theory and Practice of Digital Libraries"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-05409-8_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T17:03:21Z","timestamp":1758560601000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-05409-8_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,15]]},"ISBN":["9783032054081","9783032054098"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-05409-8_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,15]]},"assertion":[{"value":"15 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TPDL","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Theory and Practice of Digital Libraries","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tampere","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Finland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tpdl2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/tpdl2025.github.io\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}