{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T16:23:40Z","timestamp":1772209420658,"version":"3.50.1"},"reference-count":76,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"JHU"},{"name":"CMU"},{"name":"NSF","award":["OCI-1053575"],"award-info":[{"award-number":["OCI-1053575"]}]},{"name":"NSF","award":["ACI-1445606"],"award-info":[{"award-number":["ACI-1445606"]}]},{"name":"Vidi-grant from NWO","award":["276-89-003"],"award-info":[{"award-number":["276-89-003"]}]},{"name":"Delft Technology Fellowship from Delft University of Technology"},{"name":"French ANR"},{"name":"German DFG","award":["ANR-14-CE35-0002"],"award-info":[{"award-number":["ANR-14-CE35-0002"]}]},{"DOI":"10.13039\/501100000781","name":"European Research Council","doi-asserted-by":"publisher","award":["ERC-2011-AdG-295810 BOOTPHON"],"award-info":[{"award-number":["ERC-2011-AdG-295810 BOOTPHON"]}],"id":[{"id":"10.13039\/501100000781","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000781","name":"European Research Council","doi-asserted-by":"publisher","award":["ANR-10-LABX-0087 IEC"],"award-info":[{"award-number":["ANR-10-LABX-0087 IEC"]}],"id":[{"id":"10.13039\/501100000781","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000781","name":"European Research Council","doi-asserted-by":"publisher","award":["ANR-10-IDEX-0001-02 PSL*"],"award-info":[{"award-number":["ANR-10-IDEX-0001-02 PSL*"]}],"id":[{"id":"10.13039\/501100000781","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2020]]},"DOI":"10.1109\/taslp.2020.2973896","type":"journal-article","created":{"date-parts":[[2020,2,13]],"date-time":"2020-02-13T23:32:20Z","timestamp":1581636740000},"page":"964-975","source":"Crossref","is-referenced-by-count":17,"title":["Speech Technology for Unwritten Languages"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0693-8852","authenticated-orcid":false,"given":"Odette","family":"Scharenborg","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Laurent","family":"Besacier","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8820-8831","authenticated-orcid":false,"given":"Alan","family":"Black","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5631-2893","authenticated-orcid":false,"given":"Mark","family":"Hasegawa-Johnson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6663-8600","authenticated-orcid":false,"given":"Florian","family":"Metze","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Graham","family":"Neubig","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sebastian","family":"Stuker","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pierre","family":"Godard","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Markus","family":"Muller","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lucas","family":"Ondel","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shruti","family":"Palaskar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Philip","family":"Arthur","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Francesco","family":"Ciannella","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingxing","family":"Du","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Elin","family":"Larsen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Danny","family":"Merkx","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rachid","family":"Riad","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5562-8386","authenticated-orcid":false,"given":"Liming","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Emmanuel","family":"Dupoux","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","article-title":"Google's neural machine translation system: Bridging the gap between human and machine translation","author":"wu","year":"2016","journal-title":"arXiv 1609 08144"},{"key":"ref72","article-title":"Sequence-to-sequence models can directly transcribe foreign speech","author":"weiss","year":"2017","journal-title":"arXiv 1703 08581"},{"key":"ref71","article-title":"Multilingual bottle-neck features and its application for under-resourced languages","author":"vu","year":"2012","journal-title":"Proc Int Workshop Spoken Lang Technol Under-Resourced Lang"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2012.6424246"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5495637"},{"key":"ref74","first-page":"2132","article-title":"A comparative study of BNF and DNN multilingual training on cross-lingual low-resource speech recognition","author":"xu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref39","first-page":"1700","article-title":"Recurrent continuous translation models","author":"kalchbrenner","year":"2013","journal-title":"Proc Conf Empirical Methods Natural Lang Process"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1975.1055384"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.3115\/116580.116613"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2621659"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/PROC.1976.10159"},{"key":"ref36","first-page":"8111","article-title":"A summary of the 2012 JH CLSP Workshop on zero resource speech technologies and models of early language acquisition","author":"jansen","year":"0","journal-title":"Proc Int Conf Acoust Speech Signal Process"},{"key":"ref35","first-page":"153","article-title":"The JHU\/KyotoU speech translation system for IWSLT 2018","author":"inaguma","year":"0","journal-title":"Proc Int Workshop Spoken Lang Translation"},{"key":"ref34","first-page":"119","article-title":"Cross-dialectal data transferring for gaussian mixture model training in arabic speech recognition","author":"huang","year":"0","journal-title":"Proc Int Conf Arabic Lang Process (CITALA) Rabat Morocco"},{"key":"ref60","article-title":"Building an ASR system for a low-resource language through the adaptation of a high-resource language ASR system: Preliminary results","author":"scharenborg","year":"2017","journal-title":"Proc ICNLSP"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/SLTU.2018-35"},{"key":"ref63","article-title":"Very deep convolutional networks for large-scale image classification","author":"simonyan","year":"2014"},{"key":"ref28","article-title":"Image2speech: Automatically generating audio descriptions of images","author":"hasegawa-johnson","year":"0","journal-title":"Proc Int Conf Natural Lang Signal Speech Process Casablanca Morocco"},{"key":"ref64","article-title":"Innovative technologies for under-resourced language documentation: The Bulb project","author":"st\u00fcker","year":"2016","journal-title":"Proc Collaboration Comput Under-Resourced Lang"},{"key":"ref27","first-page":"1858","article-title":"Unsupervised learning of spoken language with visual context","author":"harwath","year":"0","journal-title":"Proc Adv Neural Inform Process Syst"},{"key":"ref65","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref66","article-title":"Experiments on cross-language acoustic modelling","author":"waibel","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref29","first-page":"19","article-title":"Image2speech: Automatically generating audio descriptions of images","volume":"1","author":"hasegawa-johnson","year":"0","journal-title":"J Int Sci Gen Appl"},{"key":"ref67","first-page":"b","article-title":"Use of pronouncing dictionary in speech synthesis experiments","author":"teranishi","year":"1968","journal-title":"Proc Int Congr Acoust"},{"key":"ref68","doi-asserted-by":"crossref","first-page":"443","DOI":"10.1109\/PROC.1976.10153","article-title":"linguistic rules for text-to-speech synthesis","volume":"64","author":"umeda","year":"1976","journal-title":"Proceedings of the IEEE"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.3115\/1557690.1557736"},{"key":"ref2","first-page":"368","article-title":"Encoding of phonology in a recurrent neural model of grounded speech","author":"alishani","year":"0","journal-title":"Proc Computational Natural Language Learning"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2016.04.023"},{"key":"ref20","article-title":"Vgg16 in tensorflow","author":"frossard","year":"2016"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref21","article-title":"A very low resource language speech corpus for computational language documentation experiments","author":"godard","year":"2017","journal-title":"arXiv 1710 03501"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462396"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404800"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_40"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682666"},{"key":"ref50","first-page":"2","article-title":"The IWSLT 2018 evaluation campaign","author":"niehues","year":"0","journal-title":"Proc Int Workshop Spoken Lang Translation"},{"key":"ref51","author":"ogden","year":"1923","journal-title":"The Meaning of Meaning A Study of the Influence of Language upon Thought and of the Science of Symbolism"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461761"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15760-8_22"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.909282"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref55","first-page":"1987","article-title":"Automatic image captioning","author":"pan","year":"0","journal-title":"Proc IEEE Int Conf Multimedia Expo"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/29.45533"},{"key":"ref53","first-page":"340","article-title":"Experiences from the spoken Dutch Corpus Project","author":"oostdijk","year":"0","journal-title":"Proc Int Conf Lang Resour Eval Las Palmas de Gran Canaria"},{"key":"ref52","first-page":"80","article-title":"Variational inference for acoustic unit discovery","author":"ondel","year":"0","journal-title":"Proc Journal of Computer Science"},{"key":"ref10","article-title":"Listen, attend and spell","author":"chan","year":"2015","journal-title":"arXiv 1508 01211"},{"key":"ref11","first-page":"1109","article-title":"Large-scale online learning of image similarity through ranking","volume":"11","author":"chechik","year":"2010","journal-title":"J Mach Learn Res"},{"key":"ref40","first-page":"73","article-title":"The 2014 KIT IWSLT speech-to-text systems for English, German and Italian","author":"kilgour","year":"0","journal-title":"Proc Int Workshop Spoken Lang Translation"},{"key":"ref12","article-title":"Enhancing Sentence embedding with generalized pooling","author":"chen","year":"2018","journal-title":"arXiv 1806 09828"},{"key":"ref13","article-title":"Corpus based linguistic exploration via forced alignments with a light-weight asr tool","author":"leavitt","year":"0","journal-title":"Proc Lang Technol Conf Human Lang Technol Challenge Comput Sci Linguistics"},{"key":"ref14","article-title":"Construction and analysis of a large scale image ontology","author":"deng","year":"0","journal-title":"Vision Science Society"},{"key":"ref15","first-page":"147","article-title":"Finne-tuning on clean data for end-to-end speech translation: FBK IWSLT 2018","author":"di gangi","year":"0","journal-title":"Proc Int Workshop Spoken Lang Translation"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1109"},{"key":"ref17","article-title":"XNMT: The extensible neural machine translation toolkit","author":"neubig","year":"2018","journal-title":"arXiv 1803 00188"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2017.06.008"},{"key":"ref19","article-title":"Contextual word and syllable pronunciation models","author":"fosler-lussier","year":"0","journal-title":"Proc IEEE Workshop Autom Speech Recognit Understanding"},{"key":"ref4","article-title":"Pre-training on high-resource speech recognition improves low-resource speech-to-text translation","author":"bansal","year":"2018"},{"key":"ref3","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2006.326795"},{"key":"ref5","article-title":"Listen and translate: A proof of concept for end-to-end speech-to-text translation","author":"b\u00e9rard","year":"0","journal-title":"Proc Neural Inf Process Syst Workshop End-to-End Learn Speech Audio Process"},{"key":"ref8","first-page":"1762","article-title":"CLUSTERGEN: A statistical parametric speech synthesizer using trajectory modeling","author":"black","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2016.04.030"},{"key":"ref49","article-title":"Neural machine translation and sequence-to-sequence models: A tutorial","author":"neubig","year":"2017","journal-title":"arXiv 1703 01619"},{"key":"ref9","first-page":"1211","article-title":"Random forests for statistical speech synthesis","author":"black","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854069"},{"key":"ref45","first-page":"2631","article-title":"High-quality speech synthesis for phonetic speech segmentation","author":"malfrere","year":"0","journal-title":"Proc EUROSPEECH"},{"key":"ref48","article-title":"XNMT","author":"neubig","year":"0"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1975.1162632"},{"key":"ref42","article-title":"The CMU pronouncing dictionary","author":"lenzo","year":"2014"},{"key":"ref41","first-page":"8","article-title":"The basic language resource kit (BLARK) as the first milestone for the language resources roadmap","author":"krauwer","year":"0","journal-title":"Proc Int Workshop Speech Comput Moscow Russia"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511753459"},{"key":"ref43","article-title":"Cross-language bootstrapping for unsupervised acoustic model training: Rapid development of a polish speech recognition system","author":"l\u00f6\u00f6f","year":"0","journal-title":"Proc INTERSPEECH"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/6570655\/8938144\/8998182-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/8938144\/08998182.pdf?arnumber=8998182","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T17:31:21Z","timestamp":1651080681000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8998182\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"references-count":76,"URL":"https:\/\/doi.org\/10.1109\/taslp.2020.2973896","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]}}}