{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T19:29:43Z","timestamp":1757705383402,"version":"3.40.3"},"publisher-location":"Cham","reference-count":48,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031785375"},{"type":"electronic","value":"9783031785382"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78538-2_1","type":"book-chapter","created":{"date-parts":[[2025,1,24]],"date-time":"2025-01-24T07:29:55Z","timestamp":1737703795000},"page":"3-20","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Federated Learning-Based Tokenizer for\u00a0Domain-Specific Language Models in\u00a0Finance"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1998-9272","authenticated-orcid":false,"given":"Farouk","family":"Damoun","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0670-815X","authenticated-orcid":false,"given":"Hamida","family":"Seba","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4751-9577","authenticated-orcid":false,"given":"Radu","family":"State","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,25]]},"reference":[{"key":"1_CR1","unstructured":"Amid, E., et al.: Public data-assisted mirror descent for private model training. In: International Conference on Machine Learning, pp. 517\u2013535. PMLR (2022)"},{"key":"1_CR2","unstructured":"Arapinis, M., Figueira, D., Gaboardi, M.: Sensitivity of counting queries. In: International Colloquium on Automata, Languages, and Programming (ICALP) (2016)"},{"key":"1_CR3","unstructured":"Bagdasaryan, E., Song, C., van Dalen, R., Seigel, M., \u00c1ine Cahill: training a tokenizer for free with private federated learning. In: ACL (2022). https:\/\/arxiv.org\/abs\/2203.09943"},{"key":"1_CR4","unstructured":"Balle, B., Barthe, G., Gaboardi, M.: Privacy amplification by subsampling: tight analyses via couplings and divergences. Adv. Neural Inform. Process. Syst. 31 (2018)"},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"Balle, B., Barthe, G., Gaboardi, M.: Privacy profiles and amplification by subsampling. J. Priv. Confidential. 10(1) (2020)","DOI":"10.29012\/jpc.726"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Basu, P., Roy, T.S., Naidu, R., Muftuoglu, Z.: Privacy enabled financial text classification using differential privacy and federated learning. arXiv preprint arXiv:2110.01643 (2021)","DOI":"10.18653\/v1\/2021.econlp-1.7"},{"key":"1_CR7","unstructured":"Benamar, A., Grouin, C., Bothua, M., Vilnat, A.: Evaluating tokenizers impact on oovs representation with transformers models. In: Proceedings of the Thirteenth Language Resources and Evaluation Conference, pp. 4193\u20134204 (2022)"},{"key":"1_CR8","doi-asserted-by":"crossref","unstructured":"Berglund, M., van\u00a0der Merwe, B.: Formalizing bpe tokenization. arXiv preprint arXiv:2309.08715 (2023)","DOI":"10.21203\/rs.3.rs-4196947\/v1"},{"key":"1_CR9","doi-asserted-by":"publisher","unstructured":"Cai, D., Wu, Y., Wang, S., Lin, F.X., Xu, M.: Efficient federated learning for modern nlp. In: Proceedings of the 29th Annual International Conference on Mobile Computing and Networking. ACM MobiCom 2023, Association for Computing Machinery, New York (2023). https:\/\/doi.org\/10.1145\/3570361.3592505","DOI":"10.1145\/3570361.3592505"},{"issue":"240","key":"1_CR10","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery, A., et al.: Palm: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)","journal-title":"J. Mach. Learn. Res."},{"key":"1_CR11","unstructured":"Dagan, G., Synnaeve, G., Rozi\u00e8re, B.: Getting the most out of your tokenizer for pre-training and domain adaptation. arXiv preprint arXiv:2402.01035 (2024)"},{"issue":"3","key":"1_CR12","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1145\/320613.320616","volume":"5","author":"DE Denning","year":"1980","unstructured":"Denning, D.E.: Secure statistical databases with random sample queries. ACM Trans. Database Syst. (TODS) 5(3), 291\u2013315 (1980)","journal-title":"ACM Trans. Database Syst. (TODS)"},{"key":"1_CR13","unstructured":"Ding, S., Renduchintala, A., Duh, K.: A call for prudent choice of subword merge operations in neural machine translation. arXiv preprint arXiv:1905.10453 (2019)"},{"issue":"3\u20134","key":"1_CR14","first-page":"211","volume":"9","author":"C Dwork","year":"2014","unstructured":"Dwork, C., Roth, A., et al.: The algorithmic foundations of differential privacy. Foundat. Trends Theoret. Comput. Sci. 9(3\u20134), 211\u2013407 (2014)","journal-title":"Foundat. Trends Theoret. Comput. Sci."},{"key":"1_CR15","unstructured":"Hazourli, A.: Financialbert-a pretrained language model for financial text mining. Res. Gate 2 (2022)"},{"key":"1_CR16","unstructured":"Kairouz, P., McMahan, B., Song, S., Thakkar, O., Thakurta, A., Xu, Z.: Practical and private (deep) learning without sampling or shuffling. In: International Conference on Machine Learning, pp. 5213\u20135225. PMLR (2021)"},{"issue":"3","key":"1_CR17","doi-asserted-by":"publisher","first-page":"793","DOI":"10.1137\/090756090","volume":"40","author":"SP Kasiviswanathan","year":"2011","unstructured":"Kasiviswanathan, S.P., Lee, H.K., Nissim, K., Raskhodnikova, S., Smith, A.: What can we learn privately? SIAM J. Comput. 40(3), 793\u2013826 (2011)","journal-title":"SIAM J. Comput."},{"key":"1_CR18","unstructured":"Kenton, J.D.M.W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of naacL-HLT, vol.\u00a01, p.\u00a02 (2019)"},{"key":"1_CR19","doi-asserted-by":"publisher","unstructured":"Kobayashi, G., Kuribayashi, T., Yokoi, S., Inui, K.: Transformer language models handle word frequency in prediction head. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Findings of the Association for Computational Linguistics: ACL 2023, pp. 4523\u20134535. Association for Computational Linguistics, Toronto, Canada (Jul 2023).https:\/\/doi.org\/10.18653\/v1\/2023.findings-acl.276, https:\/\/aclanthology.org\/2023.findings-acl.276","DOI":"10.18653\/v1\/2023.findings-acl.276"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Kudo, T., Richardson, J.: Sentencepiece: A simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226 (2018)","DOI":"10.18653\/v1\/D18-2012"},{"key":"1_CR21","doi-asserted-by":"crossref","unstructured":"Larbi, I.B.C., Burchardt, A., Roller, R.: Clinical text anonymization, its influence on downstream nlp tasks and the risk of re-identification. In: Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, pp. 105\u2013111 (2023)","DOI":"10.18653\/v1\/2023.eacl-srw.11"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Lewis, M., et al.: Bart: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"1_CR23","doi-asserted-by":"crossref","unstructured":"Li, Q., Diao, Y., Chen, Q., He, B.: Federated learning on non-iid data silos: an experimental study. In: 2022 IEEE 38th International Conference on Data Engineering (ICDE), pp. 965\u2013978. IEEE (2022)","DOI":"10.1109\/ICDE53745.2022.00077"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Limisiewicz, T., Balhar, J., Mare\u010dek, D.: Tokenization impacts multilingual language modeling: Assessing vocabulary allocation and overlap across languages. arXiv preprint arXiv:2305.17179 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.350"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Lin, B.Y., et al.: Fednlp: benchmarking federated learning methods for natural language processing tasks. arXiv preprint arXiv:2104.08815 (2021)","DOI":"10.18653\/v1\/2022.findings-naacl.13"},{"key":"1_CR26","doi-asserted-by":"crossref","unstructured":"Liu, Z., Huang, D., Huang, K., Li, Z., Zhao, J.: Finbert: a pre-trained financial language representation model for financial text mining. In: Proceedings of the Twenty-ninth International Conference on International Joint Conferences on Artificial Intelligence, pp. 4513\u20134519 (2021)","DOI":"10.24963\/ijcai.2020\/622"},{"key":"1_CR27","unstructured":"Mazzarino, S., Minieri, A., Gilli, L.: Nerpii: a python library to perform named entity recognition and generate personal identifiable information. In: Proceedings of the Seventh Workshop on Natural Language for Artificial Intelligence (NL4AI 2023) co-located with 22th International Conference of the Italian Association for Artificial Intelligence (AI* IA 2023) (2023)"},{"key":"1_CR28","unstructured":"McMahan, H.B., Ramage, D., Talwar, K., Zhang, L.: Learning differentially private recurrent language models. arXiv preprint arXiv:1710.06963 (2017)"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Nayak, A., Timmapathini, H., Ponnalagu, K., Venkoparao, V.G.: Domain adaptation challenges of bert in tokenization and sub-word representations of out-of-vocabulary words. In: Proceedings of the first workshop on insights from negative results in NLP, pp.\u00a01\u20135 (2020)","DOI":"10.18653\/v1\/2020.insights-1.1"},{"key":"1_CR30","unstructured":"Nogueira, R., Jiang, Z., Lin, J.: Investigating the limitations of transformers with simple arithmetic tasks. arXiv preprint arXiv:2102.13019 (2021)"},{"key":"1_CR31","unstructured":"Petrov, A., La\u00a0Malfa, E., Torr, P., Bibi, A.: Language model tokenizers introduce unfairness between languages. Adv. Neural Inform. Process. Syst. 36 (2024)"},{"key":"1_CR32","doi-asserted-by":"crossref","unstructured":"Provilkov, I., Emelianenko, D., Voita, E.: Bpe-dropout: simple and effective subword regularization. arXiv preprint arXiv:1910.13267 (2019)","DOI":"10.18653\/v1\/2020.acl-main.170"},{"issue":"8","key":"1_CR33","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"1_CR34","unstructured":"Raffel, C., Shazeer, N., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020). http:\/\/jmlr.org\/papers\/v21\/20-074.html"},{"key":"1_CR35","doi-asserted-by":"publisher","unstructured":"Rust, P., Pfeiffer, J., Vuli\u0107, I., Ruder, S., Gurevych, I.: How good is your tokenizer? on the monolingual performance of multilingual language models. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 3118\u20133135. Association for Computational Linguistics (Aug 2021).https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.243, https:\/\/aclanthology.org\/2021.acl-long.243","DOI":"10.18653\/v1\/2021.acl-long.243"},{"key":"1_CR36","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A.: Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909 (2015)","DOI":"10.18653\/v1\/P16-1162"},{"key":"1_CR37","doi-asserted-by":"crossref","unstructured":"Shah, R.Set al.: When flue meets flang: benchmarks and large pretrained language model for financial domain. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing (EMNLP). Association for Computational Linguistics (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.148"},{"key":"1_CR38","unstructured":"Shoham, O.B., Rappoport, N.: Federated learning of medical concepts embedding using behrt. arXiv preprint arXiv:2305.13052 (2023)"},{"issue":"4","key":"1_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3510033","volume":"13","author":"Y Tian","year":"2022","unstructured":"Tian, Y., Wan, Y., Lyu, L., Yao, D., Jin, H., Sun, L.: Fedbert: when federated learning meets pre-training. ACM Trans. Intell. Syst. Technol. (TIST) 13(4), 1\u201326 (2022)","journal-title":"ACM Trans. Intell. Syst. Technol. (TIST)"},{"issue":"4","key":"1_CR40","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3578707","volume":"22","author":"C Toraman","year":"2023","unstructured":"Toraman, C., Yilmaz, E.H., \u015eahinu\u00e7, F., Ozcelik, O.: Impact of tokenization on language models: an analysis for Turkish. ACM Trans. Asian Low-Resource Lang. Inform. Process. 22(4), 1\u201321 (2023)","journal-title":"ACM Trans. Asian Low-Resource Lang. Inform. Process."},{"key":"1_CR41","unstructured":"Touvron, H., et\u00a0al.: Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"1_CR42","doi-asserted-by":"crossref","unstructured":"Wang, C., Cho, K., Gu, J.: Neural machine translation with byte-level subwords. In: Proceedings of the AAAI conference on artificial intelligence, vol.\u00a034, pp. 9154\u20139160 (2020)","DOI":"10.1609\/aaai.v34i05.6451"},{"key":"1_CR43","unstructured":"Wang, Y.J., Li, Y., Qin, H., Guan, Y., Chen, S.: A novel deberta-based model for financial question answering task. arXiv preprint arXiv:2207.05875 (2022)"},{"key":"1_CR44","unstructured":"Wu, X.: Finmegatron: large financial domain language models. Proc. Second Type Res. Meeting 2021(FIN-026), 22 (2021)"},{"key":"1_CR45","doi-asserted-by":"crossref","unstructured":"Yang, H., Liu, X.Y., Wang, C.D.: Fingpt: open-source financial large language models. In: FinLLM Symposium at IJCAI 2023 (2023)","DOI":"10.2139\/ssrn.4489826"},{"issue":"2","key":"1_CR46","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3298981","volume":"10","author":"Q Yang","year":"2019","unstructured":"Yang, Q., Liu, Y., Chen, T., Tong, Y.: Federated machine learning: concept and applications. ACM Trans. Intell. Syst. Technol. (TIST) 10(2), 1\u201319 (2019)","journal-title":"ACM Trans. Intell. Syst. Technol. (TIST)"},{"key":"1_CR47","unstructured":"Zhu, W., Kairouz, P., McMahan, B., Sun, H., Li, W.: Federated heavy hitters discovery with differential privacy. In: International Conference on Artificial Intelligence and Statistics, pp. 3837\u20133847. PMLR (2020)"},{"key":"1_CR48","doi-asserted-by":"crossref","unstructured":"Zouhar, V., et al.: A formal perspective on byte-pair encoding. arXiv preprint arXiv:2306.16837 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.38"}],"container-title":["Lecture Notes in Computer Science","Social Networks Analysis and Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78538-2_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,24]],"date-time":"2025-01-24T07:30:32Z","timestamp":1737703832000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78538-2_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031785375","9783031785382"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78538-2_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"25 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ASONAM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Advances in Social Networks Analysis and Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Rende","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"asonam-12024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/asonam.cpsc.ucalgary.ca\/2024\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}