{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,7,5]],"date-time":"2024-07-05T00:08:35Z","timestamp":1720138115728},"reference-count":60,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC2007702"]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61773151"]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Fusion"],"published-print":{"date-parts":[[2024,11]]},"DOI":"10.1016\/j.inffus.2024.102506","type":"journal-article","created":{"date-parts":[[2024,6,6]],"date-time":"2024-06-06T06:29:05Z","timestamp":1717655345000},"page":"102506","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["CAM: A cross-lingual adaptation framework for low-resource language speech recognition"],"prefix":"10.1016","volume":"111","author":[{"ORCID":"http:\/\/orcid.org\/0000-0001-6509-0834","authenticated-orcid":false,"given":"Qing","family":"Hu","sequence":"first","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0001-5507-7572","authenticated-orcid":false,"given":"Yan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xianlei","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zongyu","family":"Han","sequence":"additional","affiliation":[]},{"given":"Xilong","family":"Yu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.inffus.2024.102506_b1","series-title":"Proceedings of the 40th International Conference on Machine Learning","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume":"vol. 202","author":"Radford","year":"2023"},{"key":"10.1016\/j.inffus.2024.102506_b2","series-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models","author":"Chu","year":"2023"},{"key":"10.1016\/j.inffus.2024.102506_b3","series-title":"2017 20th Conference of the Oriental Chapter of the International Coordinating Committee on Speech Databases and Speech I\/O Systems and Assessment","first-page":"1","article-title":"AISHELL-1: An open-source mandarin speech corpus and a speech recognition baseline","author":"Bu","year":"2017"},{"key":"10.1016\/j.inffus.2024.102506_b4","series-title":"Proceedings of the IJCNLP-08 Workshop on NLP for Less Privileged Languages","article-title":"Natural language processing for less privileged languages: Where do we come from? Where are we going?","author":"Singh","year":"2008"},{"key":"10.1016\/j.inffus.2024.102506_b5","series-title":"Proceedings of the Tenth International Conference on Language Resources and Evaluation","first-page":"4543","article-title":"Selection criteria for low resource language programs","author":"Cieri","year":"2016"},{"key":"10.1016\/j.inffus.2024.102506_b6","article-title":"Opportunities and challenges in working with low-resource languages","author":"Tsvetkov","year":"2017","journal-title":"Slides Part-1"},{"key":"10.1016\/j.inffus.2024.102506_b7","first-page":"12449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"vol. 33","author":"Baevski","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b8","doi-asserted-by":"crossref","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","article-title":"HuBERT: Self-supervised speech representation learning by masked prediction of hidden units","volume":"29","author":"Hsu","year":"2021","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"6","key":"10.1016\/j.inffus.2024.102506_b9","doi-asserted-by":"crossref","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","article-title":"WavLM: Large-scale self-supervised pre-training for full stack speech processing","volume":"16","author":"Chen","year":"2022","journal-title":"IEEE J. Sel. Top. Sign. Proces."},{"key":"10.1016\/j.inffus.2024.102506_b10","series-title":"Proceedings of the 39th International Conference on Machine Learning","first-page":"1298","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","volume":"vol. 162","author":"Baevski","year":"2022"},{"issue":"6","key":"10.1016\/j.inffus.2024.102506_b11","doi-asserted-by":"crossref","first-page":"1227","DOI":"10.1109\/JSTSP.2022.3184480","article-title":"Improving automatic speech recognition performance for low-resource languages with self-supervised models","volume":"16","author":"Zhao","year":"2022","journal-title":"IEEE J. Sel. Top. Sign. Proces."},{"key":"10.1016\/j.inffus.2024.102506_b12","series-title":"Unsupervised cross-lingual representation learning for speech recognition","author":"Conneau","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b13","series-title":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5206","article-title":"Librispeech: An ASR corpus based on public domain audio books","author":"Panayotov","year":"2015"},{"key":"10.1016\/j.inffus.2024.102506_b14","article-title":"Attention is all you need","volume":"vol. 30","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.inffus.2024.102506_b15","series-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","first-page":"6751","article-title":"Residual adapters for parameter-efficient ASR adaptation to atypical and accented speech","author":"Tomanek","year":"2021"},{"key":"10.1016\/j.inffus.2024.102506_b16","series-title":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7028","article-title":"Meta-adapter: Efficient cross-lingual adaptation with meta-learning","author":"Hou","year":"2021"},{"key":"10.1016\/j.inffus.2024.102506_b17","series-title":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7102","article-title":"Efficient adapter transfer of self-supervised speech models for automatic speech recognition","author":"Thomas","year":"2022"},{"key":"10.1016\/j.inffus.2024.102506_b18","doi-asserted-by":"crossref","DOI":"10.1016\/j.specom.2024.103037","article-title":"Language fusion via adapters for low-resource speech recognition","volume":"158","author":"Hu","year":"2024","journal-title":"Speech Commun."},{"key":"10.1016\/j.inffus.2024.102506_b19","doi-asserted-by":"crossref","first-page":"317","DOI":"10.1109\/TASLP.2021.3138674","article-title":"Exploiting adapters for cross-lingual low-resource speech recognition","volume":"30","author":"Hou","year":"2022","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.inffus.2024.102506_b20","series-title":"2022 IEEE Spoken Language Technology Workshop","first-page":"798","article-title":"FLEURS: FEW-shot learning evaluation of universal representations of speech","author":"Conneau","year":"2023"},{"key":"10.1016\/j.inffus.2024.102506_b21","series-title":"Proceedings of the 36th International Conference on Machine Learning","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume":"vol. 97","author":"Houlsby","year":"2019"},{"key":"10.1016\/j.inffus.2024.102506_b22","series-title":"Proceedings of the 39th International Conference on Machine Learning","first-page":"23965","article-title":"Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time","volume":"vol. 162","author":"Wortsman","year":"2022"},{"key":"10.1016\/j.inffus.2024.102506_b23","series-title":"Findings of the Association for Computational Linguistics: EACL 2023","first-page":"2054","article-title":"AdapterSoup: Weight averaging to improve generalization of pretrained language models","author":"Chronopoulou","year":"2023"},{"key":"10.1016\/j.inffus.2024.102506_b24","series-title":"International Conference on Learning Representations","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2022"},{"key":"10.1016\/j.inffus.2024.102506_b25","series-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference","first-page":"4218","article-title":"Common voice: A massively-multilingual speech corpus","author":"Ardila","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b26","series-title":"Proceedings of the 37th International Conference on Machine Learning","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume":"vol. 119","author":"Chen","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b27","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Momentum contrast for unsupervised visual representation learning","author":"He","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b28","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15750","article-title":"Exploring simple siamese representation learning","author":"Chen","year":"2021"},{"key":"10.1016\/j.inffus.2024.102506_b29","series-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.inffus.2024.102506_b30","series-title":"ALBERT: A lite BERT for self-supervised learning of language representations","author":"Lan","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b31","series-title":"Efficient estimation of word representations in vector space","author":"Mikolov","year":"2013"},{"key":"10.1016\/j.inffus.2024.102506_b32","series-title":"Representation learning with contrastive predictive coding","author":"Oord","year":"2018"},{"key":"10.1016\/j.inffus.2024.102506_b33","series-title":"wav2vec: Unsupervised pre-training for speech recognition","author":"Schneider","year":"2019"},{"key":"10.1016\/j.inffus.2024.102506_b34","series-title":"Effectiveness of self-supervised pre-training for speech recognition","author":"Baevski","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b35","series-title":"Categorical reparameterization with gumbel-softmax","author":"Jang","year":"2016"},{"key":"10.1016\/j.inffus.2024.102506_b36","series-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","first-page":"3045","article-title":"The power of scale for parameter-efficient prompt tuning","author":"Lester","year":"2021"},{"key":"10.1016\/j.inffus.2024.102506_b37","series-title":"Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining","first-page":"2802","article-title":"LRSpeech: Extremely low-resource speech synthesis and recognition","author":"Xu","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b38","series-title":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7008","article-title":"MixSpeech: Data augmentation for low-resource automatic speech recognition","author":"Meng","year":"2021"},{"key":"10.1016\/j.inffus.2024.102506_b39","doi-asserted-by":"crossref","first-page":"394","DOI":"10.1109\/TASLP.2022.3140552","article-title":"Optimizing data usage for low-resource speech recognition","volume":"30","author":"Qian","year":"2022","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.inffus.2024.102506_b40","series-title":"Interspeech 2019","article-title":"SpecAugment: A simple data augmentation method for automatic speech recognition","author":"Park","year":"2019"},{"key":"10.1016\/j.inffus.2024.102506_b41","series-title":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"8187","article-title":"Curriculum optimization for low-resource speech recognition","author":"Kuznetsova","year":"2022"},{"key":"10.1016\/j.inffus.2024.102506_b42","series-title":"Proceedings of the 26th Annual International Conference on Machine Learning","first-page":"41","article-title":"Curriculum learning","author":"Bengio","year":"2009"},{"key":"10.1016\/j.inffus.2024.102506_b43","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","article-title":"Dynamic curriculum learning for imbalanced data classification","author":"Wang","year":"2019"},{"issue":"12","key":"10.1016\/j.inffus.2024.102506_b44","doi-asserted-by":"crossref","first-page":"6827","DOI":"10.1007\/s00034-022-02075-7","article-title":"Cross-lingual self-training to learn multilingual representation for low-resource speech recognition","volume":"41","author":"Zhang","year":"2022","journal-title":"Circuits Systems Signal Process."},{"key":"10.1016\/j.inffus.2024.102506_b45","series-title":"A novel self-training approach for low-resource speech recognition","author":"Singh","year":"2023"},{"key":"10.1016\/j.inffus.2024.102506_b46","series-title":"Multilingual end-to-end speech recognition with a single transformer on low-resource languages","author":"Zhou","year":"2018"},{"issue":"3","key":"10.1016\/j.inffus.2024.102506_b47","doi-asserted-by":"crossref","first-page":"621","DOI":"10.1109\/TASLP.2018.2889606","article-title":"Language-adversarial transfer learning for low-resource speech recognition","volume":"27","author":"Yi","year":"2019","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.inffus.2024.102506_b48","series-title":"ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6071","article-title":"Language-invariant bottleneck features from adversarial end-to-end acoustic models for low resource speech recognition","author":"Yi","year":"2019"},{"key":"10.1016\/j.inffus.2024.102506_b49","series-title":"2018 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4899","article-title":"Adversarial multilingual training for low-resource speech recognition","author":"Yi","year":"2018"},{"key":"10.1016\/j.inffus.2024.102506_b50","series-title":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7844","article-title":"Meta learning for end-to-end low-resource speech recognition","author":"Hsu","year":"2020"},{"key":"10.1016\/j.inffus.2024.102506_b51","series-title":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4798","article-title":"Improved meta learning for low resource speech recognition","author":"Singh","year":"2022"},{"key":"10.1016\/j.inffus.2024.102506_b52","series-title":"Proceedings of the 34th International Conference on Machine Learning","first-page":"1126","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","volume":"vol. 70","author":"Finn","year":"2017"},{"key":"10.1016\/j.inffus.2024.102506_b53","series-title":"Reptile: a scalable metalearning algorithm","author":"Nichol","year":"2018"},{"key":"10.1016\/j.inffus.2024.102506_b54","series-title":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Parameter efficient transfer learning for various speech processing tasks","author":"Otake","year":"2023"},{"key":"10.1016\/j.inffus.2024.102506_b55","series-title":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"FindAdaptNet: Find and insert adapters by learned layer importance","author":"Huang","year":"2023"},{"key":"10.1016\/j.inffus.2024.102506_b56","series-title":"ICML \u201906","first-page":"369","article-title":"Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks","author":"Graves","year":"2006"},{"key":"10.1016\/j.inffus.2024.102506_b57","series-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"10.1016\/j.inffus.2024.102506_b58","series-title":"International Conference on Learning Representations","article-title":"SGDR: Stochastic gradient descent with warm restarts","author":"Loshchilov","year":"2017"},{"key":"10.1016\/j.inffus.2024.102506_b59","series-title":"Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH, Vol. 2018-September","first-page":"2207","article-title":"ESPNet: End-to-end speech processing toolkit","author":"Watanabe","year":"2018"},{"key":"10.1016\/j.inffus.2024.102506_b60","series-title":"Gaussian error linear units (gelus)","author":"Hendrycks","year":"2016"}],"container-title":["Information Fusion"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253524002847?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253524002847?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,7,4]],"date-time":"2024-07-04T10:52:20Z","timestamp":1720090340000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1566253524002847"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11]]},"references-count":60,"alternative-id":["S1566253524002847"],"URL":"http:\/\/dx.doi.org\/10.1016\/j.inffus.2024.102506","relation":{},"ISSN":["1566-2535"],"issn-type":[{"value":"1566-2535","type":"print"}],"subject":[],"published":{"date-parts":[[2024,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"CAM: A cross-lingual adaptation framework for low-resource language speech recognition","name":"articletitle","label":"Article Title"},{"value":"Information Fusion","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.inffus.2024.102506","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"102506"}}