{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,6,25]],"date-time":"2024-06-25T05:53:07Z","timestamp":1719294787542},"reference-count":44,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61773151"]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC2007702"]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Speech Communication"],"published-print":{"date-parts":[[2024,3]]},"DOI":"10.1016\/j.specom.2024.103037","type":"journal-article","created":{"date-parts":[[2024,2,23]],"date-time":"2024-02-23T16:46:32Z","timestamp":1708706792000},"page":"103037","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"title":["Language fusion via adapters for low-resource speech recognition"],"prefix":"10.1016","volume":"158","author":[{"ORCID":"http:\/\/orcid.org\/0000-0001-6509-0834","authenticated-orcid":false,"given":"Qing","family":"Hu","sequence":"first","affiliation":[]},{"given":"Yan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xianlei","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zongyu","family":"Han","sequence":"additional","affiliation":[]},{"given":"Xiuxia","family":"Liang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.specom.2024.103037_b1","series-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference","first-page":"4218","article-title":"Common voice: A massively-multilingual speech corpus","author":"Ardila","year":"2020"},{"key":"10.1016\/j.specom.2024.103037_b2","series-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"10.1016\/j.specom.2024.103037_b3","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"vol. 33","author":"Baevski","year":"2020"},{"key":"10.1016\/j.specom.2024.103037_b4","series-title":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6402","article-title":"Joint unsupervised and supervised training for multilingual ASR","author":"Bai","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b5","series-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)","first-page":"1","article-title":"BitFit: Simple parameter-efficient fine-tuning for transformer-based masked language-models","author":"Ben Zaken","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b6","first-page":"1877","article-title":"Language models are few-shot learners","volume":"vol. 33","author":"Brown","year":"2020"},{"key":"10.1016\/j.specom.2024.103037_b7","series-title":"2016 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4960","article-title":"Listen, attend and spell: A neural network for large vocabulary conversational speech recognition","author":"Chan","year":"2016"},{"key":"10.1016\/j.specom.2024.103037_b8","series-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4774","article-title":"State-of-the-art speech recognition with sequence-to-sequence models","author":"Chiu","year":"2018"},{"key":"10.1016\/j.specom.2024.103037_b9","article-title":"Attention-based models for speech recognition","volume":"vol. 28","author":"Chorowski","year":"2015"},{"key":"10.1016\/j.specom.2024.103037_b10","series-title":"Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","first-page":"1336","article-title":"Efficient hierarchical domain adaptation for pretrained language models","author":"Chronopoulou","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b11","series-title":"Findings of the Association for Computational Linguistics: EACL 2023","first-page":"2054","article-title":"AdapterSoup: Weight averaging to improve generalization of pretrained language models","author":"Chronopoulou","year":"2023"},{"key":"10.1016\/j.specom.2024.103037_b12","series-title":"2022 IEEE Spoken Language Technology Workshop","first-page":"798","article-title":"FLEURS: FEW-Shot learning evaluation of universal representations of speech","author":"Conneau","year":"2023"},{"key":"10.1016\/j.specom.2024.103037_b13","series-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5884","article-title":"Speech-transformer: A no-recurrence sequence-to-sequence model for speech recognition","author":"Dong","year":"2018"},{"key":"10.1016\/j.specom.2024.103037_b14","series-title":"DRAFT: A novel framework to reduce domain shifting in self-supervised learning and its application to children\u2019s ASR","author":"Fan","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b15","series-title":"Proceedings of the 34th International Conference on Machine Learning","first-page":"1126","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","volume":"vol. 70","author":"Finn","year":"2017"},{"key":"10.1016\/j.specom.2024.103037_b16","series-title":"Findings of the Association for Computational Linguistics: NAACL 2022","first-page":"2608","article-title":"AdapterBias: Parameter-efficient token-dependent representation shift for adapters in NLP tasks","author":"Fu","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b17","series-title":"Proceedings of the 23rd International Conference on Machine Learning","first-page":"369","article-title":"Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural networks","author":"Graves","year":"2006"},{"key":"10.1016\/j.specom.2024.103037_b18","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2022","first-page":"2184","article-title":"SparseAdapter: An easy approach for improving the parameter-efficiency of adapters","author":"He","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b19","series-title":"ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6381","article-title":"Streaming end-to-end speech recognition for mobile devices","author":"He","year":"2019"},{"key":"10.1016\/j.specom.2024.103037_b20","series-title":"Proc. Interspeech 2020","first-page":"1037","article-title":"Large-scale end-to-end multilingual speech recognition and language identification with multi-task learning","author":"Hou","year":"2020"},{"key":"10.1016\/j.specom.2024.103037_b21","series-title":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7028","article-title":"Meta-adapter: Efficient cross-lingual adaptation with meta-learning","author":"Hou","year":"2021"},{"key":"10.1016\/j.specom.2024.103037_b22","doi-asserted-by":"crossref","first-page":"317","DOI":"10.1109\/TASLP.2021.3138674","article-title":"Exploiting adapters for cross-lingual low-resource speech recognition","volume":"30","author":"Hou","year":"2022","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.specom.2024.103037_b23","series-title":"Proceedings of the 36th International Conference on Machine Learning","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume":"vol. 97","author":"Houlsby","year":"2019"},{"key":"10.1016\/j.specom.2024.103037_b24","series-title":"Lora: Low-rank adaptation of large language models","author":"Hu","year":"2021"},{"key":"10.1016\/j.specom.2024.103037_b25","series-title":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"FindAdaptNet: Find and insert adapters by learned layer importance","author":"Huang","year":"2023"},{"key":"10.1016\/j.specom.2024.103037_b26","series-title":"Proc. Interspeech 2019","first-page":"2130","article-title":"Large-scale multilingual speech recognition with a streaming end-to-end model","author":"Kannan","year":"2019"},{"key":"10.1016\/j.specom.2024.103037_b27","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume":"vol. 34","author":"Karimi Mahabadi","year":"2021"},{"key":"10.1016\/j.specom.2024.103037_b28","series-title":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"3179","article-title":"An adapter based pre-training for efficient and scalable self-supervised speech representation learning","author":"Kessler","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b29","series-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"10.1016\/j.specom.2024.103037_b30","series-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)","first-page":"4582","article-title":"Prefix-tuning: Optimizing continuous prompts for generation","author":"Li","year":"2021"},{"key":"10.1016\/j.specom.2024.103037_b31","series-title":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6397","article-title":"Massively multilingual ASR: A lifelong learning solution","author":"Li","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b32","article-title":"Rectifier nonlinearities improve neural network acoustic models","volume":"vol. 30","author":"Maas","year":"2013"},{"key":"10.1016\/j.specom.2024.103037_b33","series-title":"Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume","first-page":"487","article-title":"AdapterFusion: Non-destructive task composition for transfer learning","author":"Pfeiffer","year":"2021"},{"key":"10.1016\/j.specom.2024.103037_b34","series-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing","first-page":"7654","article-title":"MAD-X: An adapter-based framework for multi-task cross-lingual transfer","author":"Pfeiffer","year":"2020"},{"key":"10.1016\/j.specom.2024.103037_b35","series-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","first-page":"7930","article-title":"AdapterDrop: On the efficiency of adapters in transformers","author":"R\u00fcckl\u00e9","year":"2021"},{"key":"10.1016\/j.specom.2024.103037_b36","series-title":"Language-universal adapter learning with knowledge distillation for end-to-end multilingual speech recognition","author":"Shen","year":"2023"},{"key":"10.1016\/j.specom.2024.103037_b37","series-title":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7102","article-title":"Efficient adapter transfer of self-supervised speech models for automatic speech recognition","author":"Thomas","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b38","series-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","first-page":"6751","article-title":"Residual adapters for parameter-efficient ASR adaptation to atypical and accented speech","author":"Tomanek","year":"2021"},{"key":"10.1016\/j.specom.2024.103037_b39","series-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4904","article-title":"Multilingual speech recognition with a single end-to-end model","author":"Toshniwal","year":"2018"},{"key":"10.1016\/j.specom.2024.103037_b40","series-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"8479","article-title":"SUPERB-SG: Enhanced speech processing universal PERformance benchmark for semantic and generative capabilities","author":"Tsai","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b41","series-title":"Adamix: Mixture-of-adapter for parameter-efficient tuning of large language models","author":"Wang","year":"2022"},{"key":"10.1016\/j.specom.2024.103037_b42","series-title":"Proc. Interspeech 2018","first-page":"2207","article-title":"Espnet: End-to-end speech processing toolkit","author":"Watanabe","year":"2018"},{"key":"10.1016\/j.specom.2024.103037_b43","series-title":"Proc. Interspeech 2021","first-page":"2451","article-title":"Adapt-and-adjust: Overcoming the long-tail problem of multilingual speech recognition","author":"Winata","year":"2021"},{"key":"10.1016\/j.specom.2024.103037_b44","first-page":"21682","article-title":"Contrastive adapters for foundation model group robustness","volume":"vol. 35","author":"Zhang","year":"2022"}],"container-title":["Speech Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639324000098?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639324000098?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,3,20]],"date-time":"2024-03-20T00:28:03Z","timestamp":1710894483000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167639324000098"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3]]},"references-count":44,"alternative-id":["S0167639324000098"],"URL":"http:\/\/dx.doi.org\/10.1016\/j.specom.2024.103037","relation":{},"ISSN":["0167-6393"],"issn-type":[{"value":"0167-6393","type":"print"}],"subject":[],"published":{"date-parts":[[2024,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Language fusion via adapters for low-resource speech recognition","name":"articletitle","label":"Article Title"},{"value":"Speech Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.specom.2024.103037","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"103037"}}