{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2022,8,24]],"date-time":"2022-08-24T21:40:22Z","timestamp":1661377222428},"reference-count":61,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"6","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. & Syst."],"published-print":{"date-parts":[[2018,6,1]]},"DOI":"10.1587\/transinf.2017edp7367","type":"journal-article","created":{"date-parts":[[2018,5,31]],"date-time":"2018-05-31T22:50:13Z","timestamp":1527807013000},"page":"1591-1604","source":"Crossref","is-referenced-by-count":0,"title":["Submodular Based Unsupervised Data Selection"],"prefix":"10.1587","volume":"E101.D","author":[{"given":"Aiying","family":"ZHANG","sequence":"first","affiliation":[{"name":"Institute of Remote Sensing and Digital Earth, Chinese Academy of Sciences"},{"name":"University of Chinese Academy of Sciences"}]},{"given":"Chongjia","family":"NI","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research, A*STAR"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"publisher","unstructured":"[1] J.G. Wilpon, L.R. Rabiner, C.-H. Lee, and E.R. Goldman, \u201cAutomatic recognition of keywords in unconstrained speech using hidden Markov models,\u201d IEEE Trans. Acoust., Speech, Signal Process., vol.38, no.11, pp.1870-1878, 1990. 10.1109\/29.103088","DOI":"10.1109\/29.103088"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] R.C. Rose and D.B. Paul, \u201cA hidden Markov model based keyword recognition system,\u201d Proc. ICASSP 1990, pp.129-132, 1990. 10.1109\/icassp.1990.115555","DOI":"10.1109\/ICASSP.1990.115555"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] J. Mamou, B. Ramabhadran, and O. Siohan, \u201cVocabulary independent spoken term detection,\u201d Proc. SIGIR 2007, pp.615-622, 2007. 
10.1145\/1277741.1277847","DOI":"10.1145\/1277741.1277847"},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] D.R.H. Miller, M. Kleber, C.-L. Kao, O. Kimball, T. Colthurst, and S.A. Lowe, \u201cRapid and accurate spoken term detection,\u201d Proc. Interspeech 2007, pp.314-317, 2007.","DOI":"10.21437\/Interspeech.2007-174"},{"key":"5","unstructured":"[5] J.G. Fiscus, J. Ajot, J.S. Garofolo, and G. Doddington, \u201cResults of the 2006 spoken term detection evaluation,\u201d Proc. ACM SIGIR 2007, Workshop in Searching Spontaneous Conversational Speech (SSCS 2007), pp.51-56, 2007."},{"key":"6","unstructured":"[6] I. Szoeke, M. Fapso, and L. Burget, \u201cHybrid word-subword decoding for spoken term detection,\u201d Proc. SIGIR 2008, pp.121-129, 2008."},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] N.F. Chen, C. Ni, I.-F. Chen, S. Sivadas, V.T. Pham, H. Xu, X. Xiao, T.S. Lau, S.J. Leow, B.P. Lim, C.-C. Leung, L. Wang, C.-H. Lee, A. Goh, E.S. Chng, B. Ma, and H. Li, \u201cLow-resource keyword search strategies for Tamil,\u201d Proc. ICASSP 2015, pp.5366-5370, 2015. 10.1109\/icassp.2015.7178996","DOI":"10.1109\/ICASSP.2015.7178996"},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] C. Ni, C.-C. Leung, L. Wang, H. Liu, F. Rao, L. Lu, N.F. Chen, B. Ma, and H. Li, \u201cCross-lingual deep neural network based submodular unbiased data selection for low-resource keyword search,\u201d Proc. ICASSP 2016, pp.6015-6019, 2016. 10.1109\/icassp.2016.7472832","DOI":"10.1109\/ICASSP.2016.7472832"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] N.F. Chen, S. Sivadas, B.P. Lim, H.G. Ngo, H. Xu, V.T. Pham, B. Ma, and H. Li, \u201cStrategies for Vietnamese keyword search,\u201d Proc. ICASSP 2014, pp.4121-4125, 2014. 10.1109\/icassp.2014.6854377","DOI":"10.1109\/ICASSP.2014.6854377"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] N.F. Chen, V.T. Pham, H. Xu, X. Xiao, V.H. Do, C. Ni, I.-F. Chen, S. Sivadas, C.-H. Lee, E.S. 
Chng, B. Ma, and H. Li, \u201cExemplar-inspired strategies for low-resource spoken keyword search in Swahili,\u201d Proc. ICASSP 2016, pp.6040-6044, 2016. 10.1109\/icassp.2016.7472837","DOI":"10.1109\/ICASSP.2016.7472837"},{"key":"11","unstructured":"[11] C. Ni, C.-C. Leung, L. Wang, N.F. Chen, and B. Ma, \u201cUnsupervised data selection and word-morph mixed language model for Tamil low resource spoken keyword spotting,\u201d Proc. ICASSP 2015, pp.4714-4718, 2015."},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] C. Ni, L. Wang, H. Liu, C.-C. Leung, L. Lu, and B. Ma, \u201cSubmodular data selection with acoustic and phonetic features for automatic speech recognition,\u201d Proc. ICASSP 2015, pp.4629-4633, 2015. 10.1109\/icassp.2015.7178848","DOI":"10.1109\/ICASSP.2015.7178848"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] Y. Zhang, E. Chuangsuwanich, and J. Glass, \u201cLanguage ID-based training of multilingual stacked bottleneck features,\u201d Proc. Interspeech 2014, pp.1-5, 2014.","DOI":"10.21437\/Interspeech.2014-1"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] K. Vesely, M. Karafiat, F. Grezl, M. Janda, and E. Egorova, \u201cThe language-independent bottleneck features,\u201d Proc. SLT 2012, pp.336-340, 2012. 10.1109\/slt.2012.6424246","DOI":"10.1109\/SLT.2012.6424246"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] K.M. Knill, M.J.F. Gales, S.P. Rath, P.C. Woodland, C. Zhang, and S.-X. Zhang, \u201cInvestigation of multilingual deep neural networks for spoken term detection,\u201d Proc. ASRU 2013, pp.138-143, 2013. 10.1109\/asru.2013.6707719","DOI":"10.1109\/ASRU.2013.6707719"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] Z. Tuske, D. Nolden, R. Schluter, and H. Ney, \u201cMultilingual MRASTA features for low-resource keyword search and speech recognition systems,\u201d Proc. ICASSP 2014, pp.7854-7858, 2014. 
10.1109\/icassp.2014.6855129","DOI":"10.1109\/ICASSP.2014.6855129"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] A. Ghoshal, P. Swietojanski, and S. Renals, \u201cMultilingual training of deep neural networks,\u201d Proc. ICASSP 2013, pp.7319-7323, 2013. 10.1109\/icassp.2013.6639084","DOI":"10.1109\/ICASSP.2013.6639084"},{"key":"18","doi-asserted-by":"crossref","unstructured":"[18] J.-T. Huang, J. Li, D. Yu, L. Deng, and Y. Gong, \u201cCross-language knowledge transfer using multilingual deep neural network with shared hidden layers,\u201d Proc. ICASSP 2013, pp.7304-7308, 2013. 10.1109\/icassp.2013.6639081","DOI":"10.1109\/ICASSP.2013.6639081"},{"key":"19","unstructured":"[19] P. Golik, Z. Tuske, R. Schluter, and H. Ney, \u201cMultilingual features based keyword search for very low-resource languages,\u201d Proc. Interspeech 2015, pp.1260-1264, 2015."},{"key":"20","unstructured":"[20] Z. Tuske, P. Golik, D. Nolden, R. Schluter, and H. Ney, \u201cData augmentation, feature combination, and multilingual neural networks to improve ASR and KWS performance for low-resource languages,\u201d Proc. Interspeech 2014, pp.1420-1424, 2014."},{"key":"21","doi-asserted-by":"crossref","unstructured":"[21] N.T. Vu, D. Imseng, D. Povey, P. Motlicek, T. Schultz, and H. Bourlard, \u201cMultilingual deep neural network based acoustic modeling for rapid language adaptation,\u201d Proc. ICASSP 2014, pp.7639-7643, 2014. 10.1109\/icassp.2014.6855086","DOI":"10.1109\/ICASSP.2014.6855086"},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] J. Cui, B. Kingsbury, B. Ramabhadran, A. Sethy, K. Audhkhasi, X. Cui, E. Kislal, L. Mangu, M. Nussbaum-Thom, M. Picheny, Z. T\u00fcske, P. Golik, R. Schl\u00fcter, H. Ney, M.J.F. Gales, K.M. Knill, A. Ragni, H. Wang, and P.C. Woodland, \u201cMultilingual representations for low resource speech recognition and keyword search,\u201d Proc. ASRU 2015, pp.259-266, 2015. 
10.1109\/asru.2015.7404803","DOI":"10.1109\/ASRU.2015.7404803"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] Q.B. Nguyen, J. Gehring, M. Muller, S. Stuker, and A. Waibel, \u201cMultilingual shifting deep bottleneck features for low-resource ASR,\u201d Proc. ICASSP 2014, pp.5607-5611, 2014. 10.1109\/icassp.2014.6854676","DOI":"10.1109\/ICASSP.2014.6854676"},{"key":"24","doi-asserted-by":"crossref","unstructured":"[24] A.R. Syed, A. Rosenberg, and M. Mandel, \u201cActive learning for low-resource speech recognition: Impact of selection size and language modeling data,\u201d Proc. ICASSP 2017, pp.5315-5319, 2017. 10.1109\/icassp.2017.7953171","DOI":"10.1109\/ICASSP.2017.7953171"},{"key":"25","doi-asserted-by":"publisher","unstructured":"[25] G. Tur, D. Hakkani-T\u00fcr, and R.E. Schapire, \u201cCombining active and semi-supervised learning for spoken language understanding,\u201d Speech Communication, vol.45, no.2, pp.171-186, 2005. 10.1016\/j.specom.2004.08.002","DOI":"10.1016\/j.specom.2004.08.002"},{"key":"26","doi-asserted-by":"publisher","unstructured":"[26] D. Yu, B. Varadarajan, L. Deng, and A. Acero, \u201cActive learning and semi-supervised learning for speech recognition: A unified framework using the global entropy reduction maximization criterion,\u201d Computer Speech and Language, vol.24, no.3, pp.433-444, 2010. 10.1016\/j.csl.2009.03.004","DOI":"10.1016\/j.csl.2009.03.004"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] N. Itoh, T.N. Sainath, D.N. Jiang, J. Zhou, and B. Ramabhadran, \u201cN-best entropy based data selection for acoustic modeling,\u201d Proc. ICASSP 2012, pp.4133-4136, 2012. 10.1109\/icassp.2012.6288828","DOI":"10.1109\/ICASSP.2012.6288828"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] O. Siohan and M. Bacchiani, \u201ciVector-based acoustic data selection,\u201d Proc. 
Interspeech 2013, pp.657-661, 2013.","DOI":"10.21437\/Interspeech.2013-188"},{"key":"29","doi-asserted-by":"crossref","unstructured":"[29] O. Siohan, \u201cTraining data selection based on context-dependent state matching,\u201d Proc. ICASSP 2014, pp.3316-3319, 2014. 10.1109\/icassp.2014.6854214","DOI":"10.1109\/ICASSP.2014.6854214"},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] Y. Wu, R. Zhang, and A. Rudnicky, \u201cData selection for speech recognition,\u201d Proc. ASRU 2007, pp.562-565, 2007. 10.1109\/asru.2007.4430173","DOI":"10.1109\/ASRU.2007.4430173"},{"key":"31","doi-asserted-by":"crossref","unstructured":"[31] K. Wei, Y. Liu, K. Kirchhoff, C. Bartels, and J. Bilmes, \u201cSubmodular subset selection for large-scale speech training data,\u201d Proc. ICASSP 2014, pp.3311-3315, 2014. 10.1109\/icassp.2014.6854213","DOI":"10.1109\/ICASSP.2014.6854213"},{"key":"32","doi-asserted-by":"crossref","unstructured":"[32] C. Ni, C.-C. Leung, L. Wang, N.F. Chen, and B. Ma, \u201cEfficient methods to train multilingual bottleneck feature extractors for low resource keyword search,\u201d Proc. ICASSP 2017, pp.5650-5664, 2017. 10.1109\/icassp.2017.7953238","DOI":"10.1109\/ICASSP.2017.7953238"},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] E. Chuangsuwanich, Y. Zhang, and J. Glass, \u201cMultilingual data selection for training stacked bottleneck features,\u201d Proc. ICASSP 2016, pp.5410-5414, 2016. 10.1109\/icassp.2016.7472711","DOI":"10.1109\/ICASSP.2016.7472711"},{"key":"34","unstructured":"[34] D. Hakkani-Tur, G. Riccardi, and A. Gorin, \u201cActive learning for automatic speech recognition,\u201d Proc. ICASSP 2002, pp.3904-3907, 2002."},{"key":"35","unstructured":"[35] G. Riccardi and D. Hakkani-Tur, \u201cActive and unsupervised learning for automatic speech recognition,\u201d Proc. Eurospeech 2003, pp.1825-1828, 2003."},{"key":"36","doi-asserted-by":"crossref","unstructured":"[36] G. Tur, R.E. Schapire, and D. 
Hakkani-Tur, \u201cActive learning for spoken language understanding,\u201d Proc. ICASSP 2003, pp.I-276-I-279, 2003. 10.1109\/icassp.2003.1198771","DOI":"10.1109\/ICASSP.2003.1198771"},{"key":"37","doi-asserted-by":"crossref","unstructured":"[37] T.M. Kamm and G.G.L. Meyer, \u201cSelective sampling of training data for speech recognition,\u201d Proc. Human Language Technology Conf., San Diego, CA, 2002.","DOI":"10.3115\/1289189.1289248"},{"key":"38","unstructured":"[38] Y. Huang, D. Yu, Y. Gong, and C. Liu, \u201cSemi-supervised GMM and DNN acoustic model training with multi-system combination and confidence re-calibration,\u201d Proc. Interspeech 2013, pp.2360-2364, 2013."},{"key":"39","doi-asserted-by":"crossref","unstructured":"[39] K. Vesely, M. Hannemann, and L. Burget, \u201cSemi-supervised training of deep neural networks,\u201d Proc. ASRU 2013, pp.267-272, 2013. 10.1109\/asru.2013.6707741","DOI":"10.1109\/ASRU.2013.6707741"},{"key":"40","doi-asserted-by":"crossref","unstructured":"[40] D. Charlet, \u201cConfidence-measure-driven unsupervised incremental adaptation for HMM-based speech recognition,\u201d Proc. ICASSP 2001, pp.357-360, 2001. 10.1109\/icassp.2001.940841","DOI":"10.1109\/ICASSP.2001.940841"},{"key":"41","unstructured":"[41] X. Zhu, \u201cSemi-supervised learning literature survey,\u201d Computer Sciences Technical Report 1530, University of Wisconsin-Madison, 2005."},{"key":"42","unstructured":"[42] K. Wei, Y. Liu, K. Kirchhoff, and J. Bilmes, \u201cUsing document summarization techniques for speech data subset selection,\u201d Proc. NAACL\/HLT-2013, pp.721-726, 2013."},{"key":"43","doi-asserted-by":"crossref","unstructured":"[43] Y. Shinohara, \u201cA submodular optimization approach to sentence set selection,\u201d Proc. ICASSP 2014, pp.4140-4143, 2014.","DOI":"10.1109\/ICASSP.2014.6854375"},{"key":"44","unstructured":"[44] H. Lin and J. 
Bilmes, \u201cHow to select a good training-data subset for transcription: Submodular active selection for sequences,\u201d Proc. Interspeech 2009, pp.2859-2862, 2009."},{"key":"45","doi-asserted-by":"crossref","unstructured":"[45] K. Wei, Y. Liu, K. Kirchhoff, and J. Bilmes, \u201cUnsupervised submodular subset selection for speech data,\u201d Proc. ICASSP 2014, pp.4107-4111, 2014. 10.1109\/icassp.2014.6854374","DOI":"10.1109\/ICASSP.2014.6854374"},{"key":"46","unstructured":"[46] M. Doulaty, O. Saz, and T. Hain, \u201cData-selective transfer learning for multi-domain speech recognition,\u201d Proc. Interspeech 2015, pp.2897-2901, 2015."},{"key":"47","unstructured":"[47] T. Asami, R. Masumura, H. Masataki, M. Okamoto, and S. Sakauchi, \u201cTraining data selection for acoustic modeling via submodular optimization of joint Kullback-Leibler divergence,\u201d Proc. Interspeech 2015, pp.3645-3649, 2015."},{"key":"48","doi-asserted-by":"crossref","unstructured":"[48] S. Thomas, K. Audhkhasi, J. Cui, B. Kingsbury, and B.Ramabhadran, \u201cMultilingual data selection for low resource speech recognition,\u201d Proc. Interspeech 2016, pp.3853-3857, 2016. 10.21437\/interspeech.2016-598","DOI":"10.21437\/Interspeech.2016-598"},{"key":"49","doi-asserted-by":"crossref","unstructured":"[49] C. Ni, L. Wang, C.-C. Leung, F. Rao, L. Lu, B. Ma, and H. Li, \u201cRapid update of multilingual deep neural network for low-resource keyword search,\u201d Proc. Interspeech 2016, pp.3698-3702, 2016. 10.21437\/interspeech.2016-53","DOI":"10.21437\/Interspeech.2016-53"},{"key":"50","doi-asserted-by":"publisher","unstructured":"[50] G.L. Nemhauser, L.A. Wolsey, and M.L. Fisher, \u201cAn analysis of approximations for maximizing submodular set functions-I,\u201d Mathematical Programming, vol.14, no.1, pp.265-294, 1978. 10.1007\/bf01588971","DOI":"10.1007\/BF01588971"},{"key":"51","doi-asserted-by":"publisher","unstructured":"[51] U. 
Feige, \u201cA threshold of ln n for approximating set cover,\u201d Journal of the ACM (JACM), vol.45, no.4, pp.634-652, 1998. 10.1145\/285055.285059","DOI":"10.1145\/285055.285059"},{"key":"52","doi-asserted-by":"publisher","unstructured":"[52] T. Kinnunen and H. Li, \u201cAn overview of text independent speaker recognition: From features to supervectors,\u201d Speech Communication, vol.52, no.1, pp.12-40, 2010. 10.1016\/j.specom.2009.08.009","DOI":"10.1016\/j.specom.2009.08.009"},{"key":"53","unstructured":"[53] H. Wang, T. Lee, C.-C. Leung, B. Ma, and H. Li, \u201cUnsupervised mining of acoustic subword units with segment-level Gaussian posteriorgrams,\u201d Proc. Interspeech 2013, pp.2297-2301, 2013."},{"key":"54","doi-asserted-by":"publisher","unstructured":"[54] H. Wang, T. Lee, C.-C. Leung, B. Ma, and H. Li, \u201cAcoustic segment modeling with spectral clustering methods,\u201d IEEE Trans. Audio, Speech, Language Process., vol.23, no.2, pp.264-277, 2015. 10.1109\/taslp.2014.2387382","DOI":"10.1109\/TASLP.2014.2387382"},{"key":"55","doi-asserted-by":"crossref","unstructured":"[55] L. Zheng, C.-C. Leung, L. Xie, B. Ma, and H. Li, \u201cAcoustic TextTiling for story segmentation of spoken documents,\u201d Proc. ICASSP 2012, pp.5121-5124, 2012. 10.1109\/icassp.2012.6289073","DOI":"10.1109\/ICASSP.2012.6289073"},{"key":"56","doi-asserted-by":"publisher","unstructured":"[56] P.K. Ghosh, A. Tsiartas, and S. Narayanan, \u201cRobust voice activity detection using long-term signal variability,\u201d IEEE Trans. Audio, Speech, Language Process., vol.19, no.3, pp.600-613, 2011. 10.1109\/tasl.2010.2052803","DOI":"10.1109\/TASL.2010.2052803"},{"key":"57","doi-asserted-by":"crossref","unstructured":"[57] C.D. Manning, P. Raghavan, and H. Sch\u00fctze, Introduction to Information Retrieval, Cambridge University Press, 2008.","DOI":"10.1017\/CBO9780511809071"},{"key":"58","unstructured":"[58] https:\/\/www.ethnologue.com\/"},{"key":"59","unstructured":"[59] R. Jakobson, G. Fant, and M. Halle, Preliminaries to Speech Analysis: The Distinctive Features and Their Correlates, MIT Press, Cambridge, MA, 1952."},{"key":"60","unstructured":"[60] J. Gonzalez-Dominguez, I. Lopez-Moreno, H. Sak, J. Gonzalez-Rodriguez, and P.J. Moreno, \u201cAutomatic language identification using long short-term memory recurrent neural networks,\u201d Proc. Interspeech 2014, pp.2155-2159, 2014."},{"key":"61","unstructured":"[61] D. Povey, A. Ghoshal, G. Boulianne, L. Burget, O. Glembek, N. Goel, M. Hannemann, P. Motlicek, Y. Qian, P. Schwarz, J. Silovsky, G. Stemmer, and K. Vesely, \u201cThe Kaldi speech recognition toolkit,\u201d Proc. ASRU 2011, 2011."}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E101.D\/6\/E101.D_2017EDP7367\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,24]],"date-time":"2022-08-24T21:16:57Z","timestamp":1661375817000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E101.D\/6\/E101.D_2017EDP7367\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,6,1]]},"references-count":61,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2018]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2017edp7367","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,6,1]]}}}