{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:30:29Z","timestamp":1776882629855,"version":"3.51.2"},"reference-count":59,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"10","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2021,10,1]]},"DOI":"10.1587\/transinf.2021edp7005","type":"journal-article","created":{"date-parts":[[2021,9,30]],"date-time":"2021-09-30T22:41:41Z","timestamp":1633041701000},"page":"1661-1677","source":"Crossref","is-referenced-by-count":2,"title":["Code-Switching ASR and TTS Using Semisupervised Learning with Machine Speech Chain"],"prefix":"10.1587","volume":"E104.D","author":[{"given":"Sahoko","family":"NAKAYAMA","sequence":"first","affiliation":[{"name":"Augmented Human Communication Lab, Nara Institute of Science and Technology"},{"name":"RIKEN, Center for Advanced Intelligence Project AIP"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Andros","family":"TJANDRA","sequence":"additional","affiliation":[{"name":"Augmented Human Communication Lab, Nara Institute of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sakriani","family":"SAKTI","sequence":"additional","affiliation":[{"name":"Augmented Human Communication Lab, Nara Institute of Science and Technology"},{"name":"RIKEN, Center for Advanced Intelligence Project AIP"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satoshi","family":"NAKAMURA","sequence":"additional","affiliation":[{"name":"Augmented Human Communication Lab, Nara Institute of Science and Technology"},{"name":"RIKEN, Center for Advanced Intelligence Project AIP"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","unstructured":"[1] Japanese Ministry of Health, Labour and Welfare, \u201cOverview of the population statistics in 2017 [in Japanese].\u201d https:\/\/www.mhlw.go.jp\/toukei\/saikin\/hw\/jinkou\/kakutei17\/xls\/29toukei.xls, 2017."},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] S. Poplack, Code switching: Linguistic, pp.2062-2065, Elsevier, 2001.","DOI":"10.1016\/B0-08-043076-7\/03031-X"},{"key":"3","unstructured":"[3] M. Nakamura, \u201cDeveloping codeswitching patterns of a Japanese\/English bilingual child,\u201d Proc. ISB4, Somerville, MA, USA, pp.1679-1689, 2005."},{"key":"4","unstructured":"[4] S.S. Fotos, \u201cJapanese-English code switching in bilingual children,\u201d JALT Journal, vol.12, no.1, pp.75-98, 1990."},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] A. Tjandra, S. Sakti, and S. Nakamura, \u201cListening while speaking: Speech chain by deep learning,\u201d Proc. ASRU, Okinawa, Japan, pp.301-308, IEEE, 2017. 10.1109\/asru.2017.8268950","DOI":"10.1109\/ASRU.2017.8268950"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] A. Tjandra, S. Sakti, and S. Nakamura, \u201cMachine speech chain with one-shot speaker adaptation,\u201d Proc. INTERSPEECH, Hyderabad, India, pp.887-891, IEEE, 2018. 10.21437\/interspeech.2018-1558","DOI":"10.21437\/Interspeech.2018-1558"},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] S. Nakayama, A. Tjandra, S. Sakti, and S. Nakamura, \u201cSpeech chain for semi-supervised learning of Japanese-English code-switching ASR and TTS,\u201d Proc. SLT, Athens, Greece, IEEE, 2018. 10.1109\/slt.2018.8639674","DOI":"10.1109\/SLT.2018.8639674"},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] S. Nakayama, A. Tjandra, S. Sakti, and S. Nakamura, \u201cZero-shot code-switching ASR and TTS with multilingual machine speech chain,\u201d Proc. ASRU, Sentosa, Singapore, IEEE, 2019. 10.1109\/asru46091.2019.9003926","DOI":"10.1109\/ASRU46091.2019.9003926"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] D.C. Lyu, T.P. Tan, E.S. Chng, and H. Li, \u201cSEAME: a Mandarin-English code-switching speech corpus in South-East Asia,\u201d Proc.INTERSPEECH, pp.1986-1989, 2010.","DOI":"10.21437\/Interspeech.2010-563"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] H. Xu, S. Ding, and S. Watanabe, \u201cImproving end-to-end speech recognition with pronunciation-assisted sub-word modeling,\u201d Proc. ICASSP, pp.7110-7114, IEEE, 2019. 10.1109\/icassp.2019.8682494","DOI":"10.1109\/ICASSP.2019.8682494"},{"key":"11","unstructured":"[11] N.T. Vu, D.-C. Lyu, J. Weiner, D. Telaar, T. Schlippe, F. Blaicher, E.-S. Chng, T. Schultz, and H. Li, \u201cA first speech recognition system for Mandarin-English code-switch conversational speech,\u201d Proc. ICASSP, Kyoto, Japan, pp.4889-4892, IEEE, 2012. 10.1109\/icassp.2012.6289015"},{"key":"12","unstructured":"[12] N. Luo, D. Jiang, S. Zhao, C. Gong, W. Zou, and X. Li, \u201cTowards end-to-end code-switching speech recognition,\u201d CoRR, vol.abs\/1810.13091, 2018."},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] C. Shan, C. Weng, G. Wang, D. Su, M. Luo, D. Yu, and L. Xie, \u201cInvestigating end-to-end speech recognition for Mandarin-English code-switching,\u201d Proc. ICASSP, pp.6056-6060, IEEE, 2019. 10.1109\/icassp.2019.8682850","DOI":"10.1109\/ICASSP.2019.8682850"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] B.H. Ahmed and T.-P. Tan, \u201cAutomatic speech recognition of code switching speech using 1-best rescoring,\u201d Proc. IALP, Hanoi,Vietnam, pp.137-140, 2012. 10.1109\/ialp.2012.28","DOI":"10.1109\/IALP.2012.28"},{"key":"15","doi-asserted-by":"publisher","unstructured":"[15] E. Y\u0131lmaz, H. van den Heuvel, and D. van Leeuwen, \u201cInvestigating bilingual deep neural networks for automatic recognition of code-switching frisian speech,\u201d Procedia Computer Science, vol.81, pp.159-166, 2016. 10.1016\/j.procs.2016.04.044","DOI":"10.1016\/j.procs.2016.04.044"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] P. Guo, H. Xu, L. Xie, and E.S. Chng, \u201cStudy of semi-supervised approaches to improving English-Mandarin code-switching speech recognition,\u201d Proc. INTERSPEECH, pp.1928-1932, ISCA, Sept. 2018. 10.21437\/interspeech.2018-1974","DOI":"10.21437\/Interspeech.2018-1974"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] M. Chu, H. Peng, Y. Zhao, Z. Niu, and E. Chang, \u201cMicrosoft Mulan-a bilingual TTS system,\u201d Proc. ICASSP, Hong Kong, China, pp.264-267, IEEE, 2003. 10.1109\/icassp.2003.1198768","DOI":"10.1109\/ICASSP.2003.1198768"},{"key":"18","unstructured":"[18] H. Liang, Y. Qian, and F.K. Soong, \u201cAn HMM-based bilingual (Mandarin-English) TTS,\u201d Proc. ISCA SSW6, Bonn, Germany, pp.137-142, 2007."},{"key":"19","unstructured":"[19] S. Sitaram and A.W. Black, \u201cSpeech synthesis of code-mixed text,\u201d Proc. LREC, Miyazaki, Japan, pp.3422-3428, 2016."},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] S. Sitaram, S. Rallabandi, S. Rijhwani, and A.W. Black, \u201cExperiments with cross-lingual systems for synthesis of code-mixed text,\u201d Proc. ISCA SSW9, Sunnyvale, CA, USA, 2016. 10.21437\/ssw.2016-13","DOI":"10.21437\/SSW.2016-13"},{"key":"21","doi-asserted-by":"crossref","unstructured":"[21] S. Rallabandi and A.W. Black, \u201cOn building mixed lingual speech synthesis systems,\u201d pp.52-56, ISCA, Aug. 2017.","DOI":"10.21437\/Interspeech.2017-1244"},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] C.M. White, S. Khudanpur, and J.K. Baker, \u201cAn investigation of acoustic models for multilingual code switching,\u201d Proc.INTERSPEECH, Brisbane, Australia, pp.2691-2694, 2008.","DOI":"10.21437\/Interspeech.2008-667"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] D. Imseng, H. Bourlard, M. Magimai-Doss, and J. Dines, \u201cLanguage dependent universal phoneme posterior estimation for mixed language speech recognition,\u201d Proc. ICASSP, Prague, Czech Republic, pp.5012-5015, 2011. 10.1109\/icassp.2011.5947482","DOI":"10.1109\/ICASSP.2011.5947482"},{"key":"24","doi-asserted-by":"crossref","unstructured":"[24] H. Seki, S. Watanabe, T. Hori, J.L. Roux, and J.R. Hershey, \u201cAn end-to-end language-tracking speech recognizer for mixed-language speech,\u201d Proc. ICASSP, Calgary, Canada, IEEE, 2018. 10.1109\/icassp.2018.8462180","DOI":"10.1109\/ICASSP.2018.8462180"},{"key":"25","unstructured":"[25] H. Larochelle, D. Erhan, and Y. Bengio, \u201cZero-data learning of new tasks,\u201d Proc. AAAI, p.3, 2008."},{"key":"26","doi-asserted-by":"publisher","unstructured":"[26] M. Johnson, M. Schuster, Q.V. Le, M. Krikun, Y. Wu, Z. Chen, N. Thorat, F. Vi\u00e9gas, M. Wattenberg, G. Corrado, M. Hughes, and J. Dean, \u201cGoogle&apos;s multilingual neural machine translation system: Enabling zero-shot translation,\u201d Transactions of the Association for Computational Linguistics, vol.5, pp.339-351, Dec. 2017. 10.1162\/tacl_a_00065","DOI":"10.1162\/tacl_a_00065"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] Y. Jia, M. Johnson, W. Macherey, R.J. Weiss, Y. Cao, C.-C. Chiu, N. Ari, S. Laurenzo, and Y. Wu, \u201cLeveraging weakly supervised data to improve end-to-end speech-to-text translation,\u201d Proc. ICASSP, pp.7180-7184, IEEE, 2019. 10.1109\/icassp.2019.8683343","DOI":"10.1109\/ICASSP.2019.8683343"},{"key":"28","unstructured":"[28] M. Hasegawa-Johnson, A. Black, L. Ondel, O. Scharenborg, and F. Ciannella, \u201cImage2speech: Automatically generating audio descriptions of images,\u201d Proc. ICNLSSP, vol.1, no.1, pp.19-27, 2017."},{"key":"29","doi-asserted-by":"crossref","unstructured":"[29] W. Havard, L. Besacier, and O. Rosec, \u201cSpeech-coco: 600k visually grounded spoken captions aligned to mscoco data set,\u201d ISCA Workshop on Grounding Language Understanding (GLU2017), 2017. 10.21437\/glu.2017-9","DOI":"10.21437\/GLU.2017-9"},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] G.I. Winata, A. Madotto, C.-S. Wu, and P. Fung, \u201cCode-switched language models using neural based synthetic data from parallel sentences,\u201d Proc. CoNLL, pp.271-280, IEEE, 2019. 10.18653\/v1\/k19-1026","DOI":"10.18653\/v1\/K19-1026"},{"key":"31","doi-asserted-by":"crossref","unstructured":"[31] Y. Sharma, B. Abraham, K. Taneja, and P. Jyothi, \u201cImproving low resource code-switched asr using augmented code-switched TTS,\u201d Proc. INTERSPEECH, pp.4771-4775, IEEE, 2020. 10.21437\/interspeech.2020-2402","DOI":"10.21437\/Interspeech.2020-2402"},{"key":"32","doi-asserted-by":"crossref","unstructured":"[32] Z. Tan, X. Fan, H. Zhu, and E. Lin, \u201cAddressing accent mismatch in Mandarin-English code-switching speech recognition,\u201d Proc. ICASSP, pp.8259-8263, 2020. 10.1109\/icassp40776.2020.9053752","DOI":"10.1109\/ICASSP40776.2020.9053752"},{"key":"33","doi-asserted-by":"publisher","unstructured":"[33] P.B. Denes and E.N. Pinson, The Speech Chain: The Physics And Biology Of Spoken Language, Anchor books, Worth Publishers, 1993. 10.1017\/s0025100300004904","DOI":"10.1017\/S0025100300004904"},{"key":"34","doi-asserted-by":"crossref","unstructured":"[34] A. Tjandra, S. Sakti, and S. Nakamura, \u201cEnd-to-end feedback loss in speech chain framework via straight-through estimator,\u201d Proc. ICASSP, Brighton, UK, pp.6281-6285, IEEE, 2019. 10.1109\/icassp.2019.8683480","DOI":"10.1109\/ICASSP.2019.8683480"},{"key":"35","doi-asserted-by":"crossref","unstructured":"[35] D. Bahdanau, J. Chorowski, D. Serdyuk, P. Brakel, and Y. Bengio, \u201cEnd-to-end attention-based large vocabulary speech recognition,\u201d Proc. ICASSP, pp.4945-4949, IEEE, 2016. 10.1109\/icassp.2016.7472618","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"36","doi-asserted-by":"crossref","unstructured":"[36] W. Chan, N. Jaitly, Q.V. Le, and O. Vinyals, \u201cListen, attend and spell: A neural network for large vocabulary conversational speech recognition,\u201d Proc. ICASSP, Shanghai, China, pp.4960-4964, IEEE, 2016. 10.1109\/icassp.2016.7472621","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"37","unstructured":"[37] Y. Wang, R. Skerry-Ryan, D. Stanton, Y. Wu, R.J. Weiss, N. Jaitly, Z. Yang, Y. Xiao, Z. Chen, S. Bengio, Q. Le, Y. Agiomyrgiannakis,R. Clark, and R.A. Saurous, \u201cTacotron: A fully end-to-end text-to-speech synthesis model,\u201d Proc. INTERSPEECH, Stockholm,Sweden, pp.4006-4010, IEEE, 2017."},{"key":"38","unstructured":"[38] C. Li, X. Ma, B. Jiang, X. Li, X. Zhang, X. Liu, Y. Cao, A. Kannan, and Z. Zhu, \u201cDeep speaker: an end-to-end neural speaker embedding system,\u201d CoRR, vol.abs\/1705.02304, 2017."},{"key":"39","unstructured":"[39] D. Bahdanau, K. Cho, and Y. Bengio, \u201cNeural machine translation by jointly learning to align and translate,\u201d Proc. ICLR, pp.1-15, San Diego, CA, USA, 2015."},{"key":"40","doi-asserted-by":"crossref","unstructured":"[40] T. Luong, H. Pham, and C.D. Manning, \u201cEffective approaches to attention-based neural machine translation,\u201d Proc. EMNLP, Lisbon, Portugal, pp.1412-1421, Association for Computational Linguistics, Sept. 2015. 10.18653\/v1\/d15-1166","DOI":"10.18653\/v1\/D15-1166"},{"key":"41","unstructured":"[41] Y. Wang, R.J. Skerry-Ryan, D. Stanton, Y. Wu, R.J. Weiss, N. Jaitly, Z. Yang, Y. Xiao, Z. Chen, S. Bengio, Q.V. Le, Y.Agiomyrgiannakis, R. Clark, and R.A. Saurous, \u201cTacotron: A fully end-to-end text-to-speech synthesis model,\u201d CoRR, vol.abs\/1703.10135, 2017."},{"key":"42","unstructured":"[42] B. Xu, N. Wang, T. Chen, and M. Li, \u201cEmpirical evaluation of rectified activations in convolutional network,\u201d CoRR, 2015."},{"key":"43","unstructured":"[43] T. Takezawa, G. Kikui, M. Mizushima, and E. Sumita, \u201cMultilingual spoken language corpus development for communication research,\u201d The Association for Computational Linguistics and Chinese Language Processing, vol.12, no.3, pp.303-324, 2007."},{"key":"44","doi-asserted-by":"crossref","unstructured":"[44] G. Kikui, E. Sumita, T. Takezawa, and S. Yamamoto, \u201cCreating corpora for speech-to-speech translation,\u201d Proc. ISCA EUROSPEECH, Geneva, Switzerland, pp.381-384, 2003.","DOI":"10.21437\/Interspeech.2004-157"},{"key":"45","doi-asserted-by":"crossref","unstructured":"[45] S. Nakayama, T. Kano, Q.T. Do, S. Sakti, and S. Nakamura, \u201cJapanese-English code-switching speech data construction,\u201d Proc. O-COCOSDA, Miyazaki, Japan, IEEE, 2018. 10.1109\/icsda.2018.8693044","DOI":"10.1109\/ICSDA.2018.8693044"},{"key":"46","unstructured":"[46] P.N. Durette, \u201cgTTS-Google Text-to-Speech.\u201d https:\/\/pypi.org\/project\/gTTS\/."},{"key":"47","doi-asserted-by":"crossref","unstructured":"[47] V. Panayotov, G. Chen, D. Povey, and S. Khudanpur, \u201cLibriSpeech: an ASR corpus based on public domain audio books,\u201d Proc. ICASSP, pp.5206-5210, 2015. 10.1109\/icassp.2015.7178964","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"48","doi-asserted-by":"crossref","unstructured":"[48] H. Bu, J. Du, X. Na, B. Wu, and H. Zheng, \u201cAISHELL-1: An open-source mandarin speech corpus and a speech recognition baseline,\u201d Proc. O-COCOSDA, pp.1-5, 2017. 10.1109\/icsda.2017.8384449","DOI":"10.1109\/ICSDA.2017.8384449"},{"key":"49","doi-asserted-by":"crossref","unstructured":"[49] Z. Zeng, Y. Khassanov, V.T. Pham, H. Xu, E.S. Chng, and H. Li, \u201cOn the end-to-end solution to Mandarin-English code-switching speech recognition,\u201d Proc. INTERSPEECH, pp.2165-2169, 2019. 10.21437\/interspeech.2019-1429","DOI":"10.21437\/Interspeech.2019-1429"},{"key":"50","doi-asserted-by":"crossref","unstructured":"[50] T. Ko, V. Peddinti, D. Povey, and S. Khudanpur, \u201cAudio augmentation for speech recognition,\u201d Proc. INTERSPEECH, pp.3586-3589, 2015.","DOI":"10.21437\/Interspeech.2015-711"},{"key":"51","doi-asserted-by":"crossref","unstructured":"[51] T. Ko, V. Peddinti, D. Povey, M.L. Seltzer, and S. Khudanpur, \u201cA study on data augmentation of reverberant speech for robust speech recognition,\u201d Proc. ICASSP, pp.5220-5224, IEEE, 2017. 10.1109\/icassp.2017.7953152","DOI":"10.1109\/ICASSP.2017.7953152"},{"key":"52","unstructured":"[52] B. McFee, M. McVicar, O. Nieto, S. Balke, C. Thome, D. Liang, E. Battenberg, J. Moore, R. Bittner, R. Yamamoto, et al., \u201clibrosa 0.5.0,\u201d https:\/\/librosa.github.io\/librosa\/0.5.0\/index.html, 2017."},{"key":"53","unstructured":"[53] T. Kudo, \u201cMecab: Yet another part-of-speech and morphological analyzer,\u201d http:\/\/taku910.github.io\/mecab, 2006."},{"key":"54","unstructured":"[54] H. Miura, \u201cpykakasi-kakasi library in python.\u201d https:\/\/pypi.org\/project\/pykakasi\/."},{"key":"55","unstructured":"[55] H. Huang, \u201cpypinyin-pinyin library in python.\u201d https:\/\/pypi.org\/project\/pypinyin\/."},{"key":"56","unstructured":"[56] A. Paszke, S. Gross, S. Chintala, G. Chanan, E. Yang, Z. DeVito, Z. Lin, A. Desmaison, L. Antiga, and A. Lerer, \u201cAutomatic differentiation in pytorch,\u201d Proc. NIPS Autodiff Workshop, 2017."},{"key":"57","unstructured":"[57] D.S. Pallet, W.M. Fisher, and J.G. Fiscus, \u201cTools for the analysis of benchmark speech recognition tests,\u201d Proc. ICASSP, pp.97-100, IEEE, 1990. 10.1109\/icassp.1990.115546"},{"key":"58","unstructured":"[58] P. Gage, \u201cA new algorithm for data compression,\u201d C Users Journal, vol.12, no.2, pp.23-38, 1994."},{"key":"59","doi-asserted-by":"crossref","unstructured":"[59] S. Nakayama, \u201cSpeech chain for semi-supervised learning of Japanese-English code-switching ASR,\u201d Master&apos;s thesis, Nara Institute of Science and Technology, Japan, 2019.","DOI":"10.1109\/SLT.2018.8639674"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E104.D\/10\/E104.D_2021EDP7005\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,9]],"date-time":"2024-09-09T03:28:07Z","timestamp":1725852487000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E104.D\/10\/E104.D_2021EDP7005\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,1]]},"references-count":59,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2021]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2021edp7005","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,10,1]]},"article-number":"2021EDP7005"}}