{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:38:33Z","timestamp":1776886713146,"version":"3.51.2"},"reference-count":172,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2019,7,27]],"date-time":"2019-07-27T00:00:00Z","timestamp":1564185600000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,7,27]],"date-time":"2019-07-27T00:00:00Z","timestamp":1564185600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Artif Intell Rev"],"published-print":{"date-parts":[[2020,4]]},"DOI":"10.1007\/s10462-019-09739-y","type":"journal-article","created":{"date-parts":[[2019,7,27]],"date-time":"2019-07-27T19:02:16Z","timestamp":1564254136000},"page":"2483-2520","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["A survey on structured discriminative spoken keyword spotting"],"prefix":"10.1007","volume":"53","author":[{"given":"Shima","family":"Tabibian","sequence":"first","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,7,27]]},"reference":[{"key":"9739_CR1","doi-asserted-by":"crossref","unstructured":"Ahmad AR, Viard-Gaudin C, Khalid M (2009) Lexicon-based word recognition using support vector machine and hidden Markov model. In: International conference on document analysis and recognition (ICDAR\u201909), pp 161\u2013165","DOI":"10.1109\/ICDAR.2009.248"},{"key":"9739_CR2","doi-asserted-by":"crossref","unstructured":"Akyol A, Erdogan H (2004) Filler model based confidence measures for spoken dialogue systems: a case study for Turkish. In: International conference on acoustics, speech, and signal processing (ICASSP\u201904), pp 781\u2013784","DOI":"10.1109\/ICASSP.2004.1326102"},{"key":"9739_CR3","doi-asserted-by":"crossref","unstructured":"Alvarez R, Park H-J (2019) End-to-end streaming keyword spotting. In: International conference on acoustics, speech and signal processing (ICASSP), IEEE, pp 6336\u20136340","DOI":"10.1109\/ICASSP.2019.8683557"},{"key":"9739_CR4","unstructured":"Amodei D et al. (2015) Deep speech 2: end-to-end speech recognition in english and mandarin. arXiv preprint arXiv:151202595"},{"key":"9739_CR5","unstructured":"Ao C-W, Lee H-Y (2018) Query-by-example spoken term detection using attention-based multi-hop networks. In: IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 6264\u20136268"},{"key":"9739_CR6","doi-asserted-by":"crossref","unstructured":"Arik SO et al. (2017) Convolutional recurrent neural networks for small-footprint keyword spotting. arXiv preprint arXiv:170305390","DOI":"10.21437\/Interspeech.2017-1737"},{"key":"9739_CR7","doi-asserted-by":"crossref","unstructured":"Ayed YB, Fohr D, Haton JP, Chollet G (2002) Keyword spotting using support vector machines. In: International conference on text, speech and dialogue, pp 285\u2013292","DOI":"10.1007\/3-540-46154-X_39"},{"key":"9739_CR8","doi-asserted-by":"crossref","unstructured":"Bahi H, Benati N (2009) A new keyword spotting approach. In: International conference on multimedia computing and systems (ICMCS\u201909), pp 77\u201380","DOI":"10.1109\/MMCS.2009.5256728"},{"key":"9739_CR9","doi-asserted-by":"crossref","unstructured":"Bahl L, Brown P, De Souza P, Mercer R (1986) Maximum mutual information estimation of hidden Markov model parameters for speech recognition. In: International conference on acoustics, speech, and signal processing (ICASSP\u201986), pp 49\u201352","DOI":"10.1109\/ICASSP.1986.1169179"},{"key":"9739_CR10","doi-asserted-by":"crossref","unstructured":"Bai Y, Yi J, Ni H, Wen Z, Liu B, Li Y, Tao J (2016) End-to-end keywords spotting based on connectionist temporal classification for Mandarin. In: International symposium on chinese spoken language processing (ISCSLP), pp 1\u20135","DOI":"10.1109\/ISCSLP.2016.7918460"},{"key":"9739_CR11","volume-title":"Modelling out-of-vocabulary words for robust speech recognition","author":"I Bazzi","year":"2002","unstructured":"Bazzi I (2002) Modelling out-of-vocabulary words for robust speech recognition. Massachusetts Institute of Technology, Cambridge"},{"key":"9739_CR12","unstructured":"Benayed Y, Fohr D, Haton JP, Chollet G (2003a) Confidence measures for keyword spotting using support vector machines. In: International conference on acoustics, speech, and signal processing (ICASSP\u201903), pp 588\u2013591"},{"key":"9739_CR13","unstructured":"Benayed Y, Fohr D, Haton JP, Chollet G (2003b) Improving the performance of a keyword spotting system by using support vector machines. In: IEEE workshop on automatic speech recognition and understanding (ASRU\u201903), pp 145\u2013149"},{"key":"9739_CR14","volume-title":"Connectionist speech recognition: a hybrid approach","author":"HA Bourlard","year":"2012","unstructured":"Bourlard HA, Morgan N (2012) Connectionist speech recognition: a hybrid approach, vol 247. Springer, Berlin"},{"key":"9739_CR15","doi-asserted-by":"crossref","unstructured":"Bourlard H, D\u2019hoore B, Boite J-M (1994) Optimizing recognition and rejection performance in wordspotting systems. In: International conference on acoustics, speech, and signal processing (ICASSP-94), pp I\/373\u2013I\/376","DOI":"10.1109\/ICASSP.1994.389278"},{"key":"9739_CR16","unstructured":"Bridle JS (1973) An efficient elastic-template method for detecting given words in running speech. In: British Acoustical Society meeting, pp 1\u20134"},{"key":"9739_CR17","doi-asserted-by":"crossref","unstructured":"Burger S, MacLaren V, Yu H (2002) The ISL meeting corpus: the impact of meeting type on speech style. In: International conference on spoken language processing (IICSLP)","DOI":"10.21437\/ICSLP.2002-140"},{"key":"9739_CR18","doi-asserted-by":"crossref","unstructured":"Burget L et al. (2008) Combination of strongly and weakly constrained recognizers for reliable detection of OOVs. In: International conference on acoustics, speech and signal processing (ICASSP\u201908), pp 4081\u20134084","DOI":"10.1109\/ICASSP.2008.4518551"},{"key":"9739_CR19","unstructured":"Butko T, Camprub\u00ed CN, Schulz H (2010) Albayzin-2010 audio segmentation evaluation: evaluation setup and results. In: VI Jornadas en Tecnolog\u00eda del Habla and II Iberian SLTech workshop, pp 305\u2013308"},{"key":"9739_CR20","unstructured":"Cernocky J et al. (2007) Search in speech for public security and defense. In: IEEE workshop on signal processing applications for public security and forensics (SAFE), pp 1\u20137"},{"key":"9739_CR21","first-page":"27","volume":"2","author":"CC Chang","year":"2011","unstructured":"Chang CC, Lin CJ (2011) LIBSVM: a library for support vector machines. ACM Trans Intell Syst Technol (TIST) 2:27","journal-title":"ACM Trans Intell Syst Technol (TIST)"},{"key":"9739_CR22","first-page":"720","volume":"2","author":"M Chavan","year":"2012","unstructured":"Chavan M, Chougule S (2012) Speaker features and recognition techniques: a review. Int J Comput Eng Res 2:720\u2013728","journal-title":"Int J Comput Eng Res"},{"key":"9739_CR23","doi-asserted-by":"crossref","first-page":"257","DOI":"10.1109\/TASL.2006.876717","volume":"15","author":"CP Chen","year":"2007","unstructured":"Chen CP, Bilmes JA (2007) MVA processing of speech features. IEEE Trans Audio Speech Lang Process 15:257\u2013270","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"9739_CR24","unstructured":"Chen JC, Chien JT (2009) Bayesian large margin hidden Markov models for speech recognition. In: International conference on acoustics, speech and signal processing (ICASSP\u201909), pp 3765\u20133768"},{"key":"9739_CR25","doi-asserted-by":"crossref","unstructured":"Chen G, Parada C, Heigold G (2014) Small-footprint keyword spotting using deep neural networks. In: International conference on acoustics, speech and signal processing (ICASSP\u201914), pp 4087\u20134091","DOI":"10.1109\/ICASSP.2014.6854370"},{"key":"9739_CR26","doi-asserted-by":"crossref","unstructured":"Chen G, Parada C, Sainath TN (2015) Query-by-example keyword spotting using long short-term memory networks. In: International conference on acoustics, speech and signal processing (ICASSP), pp 5236\u20135240","DOI":"10.1109\/ICASSP.2015.7178970"},{"key":"9739_CR27","volume-title":"Fisher english training speech part 1 transcripts LDC2004T19 web download","author":"C Cieri","year":"2004","unstructured":"Cieri C, Graff D, Kimball O, Miller D, Walker K (2004) Fisher english training speech part 1 transcripts LDC2004T19 web download. Linguistic Data Consortium, Philadelphia"},{"key":"9739_CR28","volume-title":"Fisher english training part 2, transcripts LDC2005T19","author":"C Cieri","year":"2005","unstructured":"Cieri C, Graff D, Kimball O, Miller D, Walker K (2005) Fisher english training part 2, transcripts LDC2005T19. Linguistic Data Consortium, Philadelphia"},{"key":"9739_CR29","volume-title":"Wake-up word detection using LSTM neural networks","author":"WH Clemens Vayda","year":"2016","unstructured":"Clemens Vayda WH (2016) Wake-up word detection using LSTM neural networks. Graz University of Technology, Graz"},{"key":"9739_CR30","unstructured":"Cortes C, Mohri M (2005) Confidence intervals for the area under the roc curve. In: Advances in neural information processing systems (NIPS), Proceedings of the 2004 Conference. The MIT Press, Cambridge, MA, vol 17, No. 6, pp 305\u2013312"},{"key":"9739_CR31","volume-title":"An introduction to support vector machines","author":"N Cristianini","year":"2000","unstructured":"Cristianini N, Shawe Taylor J (2000) An introduction to support vector machines. Cambridge University Press, Cambridge"},{"key":"9739_CR32","volume-title":"Application in automatic speech recognition: keyword spotting based on online garbage modeling","author":"Z De Greve","year":"2006","unstructured":"De Greve Z (2006) Application in automatic speech recognition: keyword spotting based on online garbage modeling. Faculti Polytechnique de Mons, IDIAP Research Institute, Martigny"},{"key":"9739_CR33","unstructured":"Dekel O, Keshet J, Singer Y (2004) An online algorithm for hierarchical phoneme classification. In: International workshop on machine learning for multimodal interaction, pp 146\u2013158"},{"key":"9739_CR34","doi-asserted-by":"crossref","unstructured":"Dymarski P, Wydra S (2008) Large margin hidden Markov models in command recognition and speaker verification problems. In: International conference on systems, signals and image processing (IWSSIP\u201908), pp 221\u2013224","DOI":"10.1109\/IWSSIP.2008.4604407"},{"key":"9739_CR35","doi-asserted-by":"crossref","first-page":"861","DOI":"10.1016\/j.patrec.2005.10.010","volume":"27","author":"T Fawcett","year":"2006","unstructured":"Fawcett T (2006) An introduction to ROC analysis. Pattern Recognit Lett 27:861\u2013874","journal-title":"Pattern Recognit Lett"},{"key":"9739_CR36","doi-asserted-by":"crossref","unstructured":"Fern\u00e1ndez S, Graves A, Schmidhuber J (2007) An application of recurrent neural networks to discriminative keyword spotting. In: International conference on artificial neural networks, pp 220\u2013229","DOI":"10.1007\/978-3-540-74695-9_23"},{"key":"9739_CR37","doi-asserted-by":"crossref","unstructured":"Ferrer L, Estienne C (2001) Improving performance of a keyword spotting system by using a new confidence measure. In: INTERSPEECH, pp 2561\u20132564","DOI":"10.21437\/Eurospeech.2001-599"},{"key":"9739_CR38","unstructured":"Fiscus JG, Ajot J, Garofolo JS, Doddingtion G (2007) Results of the 2006 spoken term detection evaluation. In: Proceedings of SIGIR, pp 51\u201357"},{"key":"9739_CR39","unstructured":"Fisher WM (1986) The DARPA speech recognition research database: specifications and status. In: Fisher WM, Doddington GR, Goudie-Marshall KM (eds) Proceedings of DARPA workshop on speech recognition, pp 93\u201399"},{"key":"9739_CR40","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1561\/2000000004","volume":"1","author":"M Gales","year":"2008","unstructured":"Gales M, Young S (2008) The application of hidden Markov models in speech recognition. Found Trends Signal Process 1:195\u2013304","journal-title":"Found Trends Signal Process"},{"key":"9739_CR41","doi-asserted-by":"crossref","first-page":"70","DOI":"10.1109\/MSP.2012.2207140","volume":"29","author":"MJF Gales","year":"2012","unstructured":"Gales MJF, Watanabe S, Fosler-Lussier E (2012) Structured discriminative models for speech recognition: an overview. IEEE Signal Process Mag 29:70\u201381","journal-title":"IEEE Signal Process Mag"},{"key":"9739_CR42","unstructured":"Gales MJ, Knill KM, Ragni A, Rath SP (2014a) Speech recognition and keyword spotting for low-resource languages: BABEL project research at CUED. In: Spoken language technologies for under-resourced languages, pp 16\u201323"},{"key":"9739_CR43","unstructured":"Gales MJ, Knill KM, Ragni A, Rath SP (2014b) Speech recognition and keyword spotting for low-resource languages: Babel project research at CUED. In: SLTU, pp 16\u201323"},{"key":"9739_CR44","volume-title":"CSR-I (WSJ0) complete LDC93S6A web download","author":"J Garofolo","year":"1993","unstructured":"Garofolo J, Graff D, Paul D, Pallett D (1993a) CSR-I (WSJ0) complete LDC93S6A web download. Linguistic Data Consortium, Philadelphia"},{"key":"9739_CR45","doi-asserted-by":"crossref","DOI":"10.6028\/NIST.IR.4930","volume-title":"DARPA TIMIT acoustic phonetic continuous speech corpus","author":"J Garofolo","year":"1993","unstructured":"Garofolo J, Lamel L, Fisher W, Fiscus J, Pallett D, Dahlgren N (1993b) DARPA TIMIT acoustic phonetic continuous speech corpus, vol LDC93S1. Linguistic Data Consortium, Philadelphia"},{"key":"9739_CR46","doi-asserted-by":"crossref","unstructured":"Glass J, Hazen TJ, Hetherington L, Wang C (2004) Analysis and processing of lecture audio data: preliminary investigations. In: Proceedings of the workshop on interdisciplinary approaches to speech indexing and retrieval at HLT-NAACL, pp 9\u201312","DOI":"10.3115\/1626307.1626309"},{"key":"9739_CR47","volume-title":"Switchboard-1 release 2 LDC97S62","author":"EHJ Godfrey","year":"1993","unstructured":"Godfrey EHJ (1993) Switchboard-1 release 2 LDC97S62. Linguistic Data Consortium, Philadelphia"},{"key":"9739_CR48","doi-asserted-by":"crossref","unstructured":"Graves A, Fern\u00e1ndez S, Gomez F, Schmidhuber J (2006) Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning, ACM, pp 369\u2013376","DOI":"10.1145\/1143844.1143891"},{"key":"9739_CR49","doi-asserted-by":"crossref","unstructured":"Guo J, Kumatani K, Sun M, Wu M, Raju A, Str\u00f6m\u00a0N, Mandal A (2018) Time-delayed bottleneck highway networks using a DFT feature for keyword spotting. In: IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 5489\u20135493","DOI":"10.1109\/ICASSP.2018.8462166"},{"key":"9739_CR50","doi-asserted-by":"crossref","unstructured":"He Y, Prabhavalkar R, Rao K, Li W, Bakhtin A, McGraw I (2017) Streaming small-footprint keyword spotting using sequence-to-sequence models. In: Automatic speech recognition and understanding workshop (ASRU), IEEE, pp 474\u2013481","DOI":"10.1109\/ASRU.2017.8268974"},{"key":"9739_CR51","doi-asserted-by":"crossref","unstructured":"Heracleous P, Shimizu T (2003) An efficient keyword spotting technique using a complementary language for filler models training. In: European conference on speech communication and technology (EuroSpeech), pp 921\u2013924","DOI":"10.21437\/Eurospeech.2003-323"},{"key":"9739_CR52","doi-asserted-by":"crossref","first-page":"578","DOI":"10.1109\/89.326616","volume":"2","author":"H Hermansky","year":"1994","unstructured":"Hermansky H, Morgan N (1994) RASTA processing of speech. IEEE Trans Speech Audio Process 2:578\u2013589","journal-title":"IEEE Trans Speech Audio Process"},{"key":"9739_CR53","doi-asserted-by":"crossref","unstructured":"Hermansky H, Morgan N, Bayya A, Kohn P (1991) Compensation for the effect of the communication channel in auditory-like analysis of speech (RASTA-PLP). In: European conference on speech communication and technology (EuroSpeech), pp 1367\u20131370","DOI":"10.21437\/Eurospeech.1991-312"},{"key":"9739_CR54","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9:1735\u20131780","journal-title":"Neural Comput"},{"key":"9739_CR55","doi-asserted-by":"crossref","unstructured":"Huang H, Zhu J (2006) Kernel based non-linear feature extraction methods for speech recognition. In: International conference on intelligent systems design and applications (ISDA\u201906), pp 749\u2013754","DOI":"10.1109\/ISDA.2006.253706"},{"key":"9739_CR56","volume-title":"Spoken language processing: a guide to theory, algorithm, and system development","author":"X Huang","year":"2001","unstructured":"Huang X, Acero A, Hon H-W (2001) Spoken language processing: a guide to theory, algorithm, and system development. Prentice Hall PTR, Upper Saddle River"},{"key":"9739_CR57","unstructured":"Hwang K, Lee M, Sung W (2015) Online keyword spotting with a character-level recurrent neural network. arXiv preprint arXiv:151208903"},{"key":"9739_CR58","doi-asserted-by":"crossref","first-page":"116","DOI":"10.1016\/j.cviu.2006.10.019","volume":"108","author":"A Jaimes","year":"2007","unstructured":"Jaimes A, Sebe N (2007) Multimodal human\u2013computer interaction: a survey. Comput Vis Image Underst 108:116\u2013134","journal-title":"Comput Vis Image Underst"},{"key":"9739_CR59","doi-asserted-by":"crossref","unstructured":"Janin A et al. (2003) The ICSI meeting corpus. In: IEEE international conference on acoustics, speech, and signal processing (ICASSP\u201903), IEEE, pp 364\u2013367","DOI":"10.1109\/ICASSP.2003.1198793"},{"key":"9739_CR60","doi-asserted-by":"crossref","first-page":"1584","DOI":"10.1109\/TASL.2006.879805","volume":"14","author":"H Jiang","year":"2006","unstructured":"Jiang H, Li X, Liu C (2006) Large margin hidden Markov models for speech recognition. IEEE Trans Audio Speech Lang Process 14:1584\u20131595","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"9739_CR61","doi-asserted-by":"crossref","first-page":"3043","DOI":"10.1109\/78.175747","volume":"40","author":"B-H Juang","year":"1992","unstructured":"Juang B-H, Katagiri S (1992) Discriminative learning for minimum error classification (pattern recognition). IEEE Trans Signal Process 40:3043\u20133054","journal-title":"IEEE Trans Signal Process"},{"key":"9739_CR62","doi-asserted-by":"crossref","unstructured":"Junkawitsch J, Ruske G, H\u00f6ge H (1997) Efficient methods for detecting keywords in continuous speech. In: EUROSPEECH, pp 259\u2013262","DOI":"10.21437\/Eurospeech.1997-92"},{"key":"9739_CR63","unstructured":"Kamper H, Shakhnarovich G, Livescu K (2017) Semantic keyword spotting by learning from images and speech. arXiv preprint arXiv:171001949"},{"key":"9739_CR64","doi-asserted-by":"crossref","first-page":"e2772","DOI":"10.1016\/j.na.2009.06.089","volume":"71","author":"V K\u00ebpuska","year":"2009","unstructured":"K\u00ebpuska V, Klein T (2009) A novel wake-up-word speech recognition system, wake-up-word recognition task, technology and evaluation. Nonlinear Analysis: Theory Methods Appl 71:e2772\u2013e2789","journal-title":"Nonlinear Analysis: Theory Methods Appl"},{"key":"9739_CR65","unstructured":"Keshet J (2007) Theoretical foundations for large-margin kernel-based continuous speech recognition. IDIAP"},{"key":"9739_CR66","doi-asserted-by":"crossref","DOI":"10.1002\/9780470742044","volume-title":"Automatic speech and speaker recognition: large margin and kernel methods","author":"J Keshet","year":"2009","unstructured":"Keshet J, Bengio S (2009) Automatic speech and speaker recognition: large margin and kernel methods. Wiley, London"},{"key":"9739_CR67","doi-asserted-by":"crossref","unstructured":"Keshet J, Shalev-Shwartz S, Singer Y, Chazan D (2005) Phoneme alignment based on discriminative learning. In: INTERSPEECH, pp 2961\u20132964","DOI":"10.21437\/Interspeech.2005-129"},{"key":"9739_CR68","doi-asserted-by":"crossref","unstructured":"Keshet J, Bengio S, Chazan D, Shalev-Shwartz S, Singer Y (2006) Discriminative kernel-based phoneme sequence recognition. IDIAP","DOI":"10.21437\/Interspeech.2006-217"},{"key":"9739_CR69","doi-asserted-by":"crossref","first-page":"2373","DOI":"10.1109\/TASL.2007.903928","volume":"15","author":"J Keshet","year":"2007","unstructured":"Keshet J, Shalev-Shwartz S, Singer Y, Chazan D (2007) A large margin algorithm for speech-to-phoneme and music-to-score alignment. IEEE Trans Audio Speech Lang Process 15:2373\u20132382","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"9739_CR70","doi-asserted-by":"crossref","first-page":"317","DOI":"10.1016\/j.specom.2008.10.002","volume":"51","author":"J Keshet","year":"2009","unstructured":"Keshet J, Grangier D, Bengio S (2009) Discriminative keyword spotting. Speech Commun 51:317\u2013329","journal-title":"Speech Commun"},{"key":"9739_CR71","doi-asserted-by":"crossref","unstructured":"Ketabdar H, Vepa J, Bengio S, Bourlard H (2006) Posterior based keyword spotting with a priori thresholds. In: International conference on spoken language processing (ICSLP), vol LIDIAP-CONF-2006-017, pp 633\u2013636","DOI":"10.21437\/Interspeech.2006-458"},{"key":"9739_CR72","doi-asserted-by":"crossref","unstructured":"Knill KM, Gales MJ, Rath SP, Woodland PC, Zhang C, Zhang SX (2013) Investigation of multilingual deep neural networks for spoken term detection. In: IEEE workshop on automatic speech recognition and understanding (ASRU), pp 138\u2013143","DOI":"10.1109\/ASRU.2013.6707719"},{"key":"9739_CR73","doi-asserted-by":"crossref","unstructured":"Kumatani K, Panchapagesan S, Wu M, Kim M, Strom N, Tiwari G, Mandai A (2017) Direct modeling of raw audio with DNNS for wake word detection. In: IEEE automatic speech recognition and understanding workshop (ASRU), pp 252\u2013257","DOI":"10.1109\/ASRU.2017.8268943"},{"key":"9739_CR74","unstructured":"Kuo J-W, Lo H-Y, Wang H-M (2007) Improved HMM\/SVM methods for automatic phoneme segmentation. In: Interspeech, Citeseer, pp 2057\u20132060"},{"key":"9739_CR75","unstructured":"Lafferty J, McCallum A, Pereira F (2001) Conditional random fields: probabilistic models for segmenting and labeling sequence data. In: Proceedings of the eighteenth international conference on machine learning, ICML, pp 282\u2013289"},{"key":"9739_CR76","doi-asserted-by":"crossref","unstructured":"Lee A, Shikano K, Kawahara T (2004) Real-time word confidence scoring using local posterior probabilities on tree trellis search. In: International conference on acoustics, speech, and signal processing (ICASSP\u201904), vol 791, pp I-793\u2013796","DOI":"10.1109\/ICASSP.2004.1326105"},{"key":"9739_CR77","unstructured":"Lengerich C, Hannun A (2016) An end-to-end architecture for keyword spotting and voice activity detection. arXiv preprint arXiv:161109405"},{"key":"9739_CR78","doi-asserted-by":"crossref","unstructured":"Li K, Naylor J, Rossen M (1992) A whole word recurrent neural network for keyword spotting. In: International conference on acoustics, speech, and signal processing (ICASSP-92), pp 81\u201384","DOI":"10.1109\/ICASSP.1992.226115"},{"key":"9739_CR79","doi-asserted-by":"crossref","first-page":"745","DOI":"10.1109\/TASLP.2014.2304637","volume":"22","author":"J Li","year":"2014","unstructured":"Li J, Deng L, Gong Y, Haeb-Umbach R (2014) An overview of noise-robust automatic speech recognition. IEEE\/ACM Trans Audio Speech Lang Process 22:745\u2013777","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"9739_CR80","first-page":"145","volume":"10","author":"CY Lin","year":"2005","unstructured":"Lin CY, Jang JSR, Chen KT (2005) Automatic segmentation and labeling for Mandarin Chinese speech corpora for concatenation-based TTS. Int J Comput Linguist Chin Lang Process Spec Issue Annot Speech Corpora 10:145\u2013166","journal-title":"Int J Comput Linguist Chin Lang Process Spec Issue Annot Speech Corpora"},{"key":"9739_CR81","doi-asserted-by":"crossref","unstructured":"Lin H, Bilmes J, Vergyri D, Kirchhoff K (2007) OOV detection by joint word\/phone lattice alignment. In: IEEE workshop on automatic speech recognition & understanding, (ASRU), pp 478\u2013483","DOI":"10.1109\/ASRU.2007.4430159"},{"key":"9739_CR82","volume-title":"CSR-II (wsj1) complete","author":"Linguistic Data Consortium","year":"1994","unstructured":"Linguistic Data Consortium (1994) CSR-II (wsj1) complete, vol LDC94S13A. Linguistic Data Consortium, Philadelphia"},{"key":"9739_CR83","unstructured":"Manos AS, Zue VW (1997) A segment-based wordspotter using phonetic filler models. In: International conference on acoustics, speech, and signal processing (ICASSP-97), pp 899\u2013902"},{"key":"9739_CR84","doi-asserted-by":"crossref","unstructured":"Marcus JN (1992) A novel algorithm for HMM word spotting performance evaluation and error analysis. In: International conference on acoustics, speech, and signal processing (ICASSP-92), IEEE, pp 89\u201392","DOI":"10.1109\/ICASSP.1992.226113"},{"key":"9739_CR85","volume-title":"The DET curve in assessment of detection task performance DTIC document","author":"A Martin","year":"1997","unstructured":"Martin A, Doddington G, Kamm T, Ordowski M, Przybocki M (1997) The DET curve in assessment of detection task performance DTIC document. National Institute of Standards and Technology, Gaithersburg"},{"key":"9739_CR86","doi-asserted-by":"crossref","unstructured":"Matejka P, Zhang L, Ng T, Mallidi HS, Glembek O, Ma\u00a0J, Zhang B (2014) Neural network bottleneck features for language identification. In: Proceedings of Odyssey, pp 299\u2013304","DOI":"10.21437\/Odyssey.2014-45"},{"key":"9739_CR87","doi-asserted-by":"crossref","first-page":"1066","DOI":"10.1016\/j.csl.2013.12.004","volume":"28","author":"F Metze","year":"2014","unstructured":"Metze F, Anguera X, Barnard E, Davel M, Gravier G (2014) Language independent search in MediaEval\u2019s spoken web search task. Comput Speech Lang 28:1066\u20131082","journal-title":"Comput Speech Lang"},{"key":"9739_CR88","doi-asserted-by":"crossref","unstructured":"Michel M, Ajot J, Fiscus J (2006) The NIST meeting room corpus 2 phase 1. In: International workshop on machine learning for multimodal interaction, Springer, pp 13\u201323","DOI":"10.1007\/11965152_2"},{"key":"9739_CR89","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/1687-4722-2014-2","volume":"2014","author":"M Miki","year":"2014","unstructured":"Miki M, Kitaoka N, Miyajima C, Nishino T, Takeda K (2014) Improvement of multimodal gesture and speech recognition performance using time intervals between gestures and accompanying speech. EURASIP J Audio Speech Music Process 2014:1\u20137","journal-title":"EURASIP J Audio Speech Music Process"},{"key":"9739_CR90","doi-asserted-by":"crossref","unstructured":"Miller DR et al. (2007) Rapid and accurate spoken term detection. In: Annual conference of the international speech communication association (INTERSPEECH), pp 314\u2013317","DOI":"10.21437\/Interspeech.2007-174"},{"key":"9739_CR91","doi-asserted-by":"crossref","first-page":"1065","DOI":"10.1016\/j.specom.2012.05.002","volume":"54","author":"MH Moattar","year":"2012","unstructured":"Moattar MH, Homayounpour MM (2012) A review on speaker diarization systems and approaches. Speech Commun 54:1065\u20131103","journal-title":"Speech Commun"},{"key":"9739_CR92","doi-asserted-by":"crossref","unstructured":"Molau S, Hilger F, Ney H (2003) Feature space normalization in adverse acoustic conditions. In: International conference on acoustics, speech, and signal processing (ICASSP\u201903), pp I-656\u2013I-659","DOI":"10.1109\/ICASSP.2003.1198866"},{"key":"9739_CR93","doi-asserted-by":"crossref","unstructured":"Motlicek P, Valente F, Szoke I (2012) Improving acoustic based keyword spotting using LVCSR lattices. In: 2012 IEEE international conference on acoustics, speech and signal processing (ICASSP), IEEE, pp 4413\u20134416","DOI":"10.1109\/ICASSP.2012.6288898"},{"key":"9739_CR94","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/1687-4722-2012-1","volume":"2012","author":"NS Nehe","year":"2012","unstructured":"Nehe NS, Holambe RS (2012) DWT and LPC based feature extraction methods for isolated word recognition. EURASIP J Audio Speech Music Process 2012:1\u20137","journal-title":"EURASIP J Audio Speech Music Process"},{"key":"9739_CR95","doi-asserted-by":"crossref","first-page":"417","DOI":"10.1016\/j.sigpro.2011.08.006","volume":"92","author":"K Ngo","year":"2012","unstructured":"Ngo K, Spriet A, Moonen M, Wouters J, Jensen SH (2012) A combined multi-channel Wiener filter-based noise reduction and dynamic range compression in hearing aids. Sig Process 92:417\u2013426","journal-title":"Sig Process"},{"key":"9739_CR96","volume-title":"NIST open keyword search 2013 evaluation (OpenKWS13)","author":"NIST","year":"2013","unstructured":"NIST (2013) NIST open keyword search 2013 evaluation (OpenKWS13), 1st edn. National Institute of Standards and Technology (NIST), Washington DC","edition":"1"},{"key":"9739_CR97","volume-title":"NIST open keyword search 2014 evaluation (OpenKWS14)","author":"NIST","year":"2014","unstructured":"NIST (2014) NIST open keyword search 2014 evaluation (OpenKWS14), 1st edn. National Institute of Standards and Technology (NIST), Washington DC","edition":"1"},{"key":"9739_CR98","volume-title":"NIST open keyword search 2015 evaluation (OpenKWS15)","author":"NIST","year":"2015","unstructured":"NIST (2015) NIST open keyword search 2015 evaluation (OpenKWS15), 1st edn. National Institute of Standards and Technology (NIST), Washington DC","edition":"1"},{"key":"9739_CR99","volume-title":"NIST open keyword search 2016 evaluation (OpenKWS16)","author":"NIST","year":"2016","unstructured":"NIST (2016) NIST open keyword search 2016 evaluation (OpenKWS16), 1st edn. National Institute of Standards and Technology (NIST), Washington DC","edition":"1"},{"key":"9739_CR100","doi-asserted-by":"crossref","unstructured":"Ou Z, Luo H (2012) CRF-based confidence measures of recognized candidates for lattice-based audio indexing. In: 2012 IEEE international conference on acoustics, speech and signal processing (ICASSP), IEEE, pp 4933\u20134936","DOI":"10.1109\/ICASSP.2012.6289026"},{"key":"9739_CR101","doi-asserted-by":"crossref","unstructured":"Panayotov V, Chen G, Povey D, Khudanpur S (2015) Librispeech: an ASR corpus based on public domain audio books. In: International conference on acoustics, speech and signal processing (ICASSP), pp 5206\u20135210","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"9739_CR102","volume-title":"A large set of audio features for sound description (similarity and classification) in the CUIDADO project, Cuidado project report","author":"G Peeters","year":"2004","unstructured":"Peeters G (2004) A large set of audio features for sound description (similarity and classification) in the CUIDADO project, Cuidado project report. IRCAM, Paris"},{"key":"9739_CR103","volume-title":"Automatic speech recognition using Kaldi","author":"O Pl\u00e1tek","year":"2014","unstructured":"Pl\u00e1tek O (2014) Automatic speech recognition using Kaldi. Charles University in Prague, Prague"},{"key":"9739_CR104","unstructured":"Platt JC (1999) Fast training of support vector machines using sequential minimal optimization. In: Advances in kernel methods: support vector learning. MIT Press, pp 185\u2013208"},{"key":"9739_CR105","unstructured":"Povey D, Woodland PC (2002) Minimum phone error and I-smoothing for improved discriminative training. In: International conference on acoustics, speech, and signal processing (ICASSP\u201902), pp I-105\u2013I-108"},{"key":"9739_CR106","first-page":"37","volume":"2","author":"DM Powers","year":"2011","unstructured":"Powers DM (2011) Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation. J Mach Learn Technol 2:37\u201363","journal-title":"J Mach Learn Technol"},{"key":"9739_CR107","volume-title":"Fundamentals of speech recognition","author":"L Rabiner","year":"1993","unstructured":"Rabiner L, Juang B-H (1993) Fundamentals of speech recognition. PTR Prentice Hall, Englewood Cliffs"},{"key":"9739_CR108","doi-asserted-by":"crossref","unstructured":"Ramabhadran B, Sethy A, Mamou J, Kingsbury B, Chaudhari U (2009) Fast decoding for open vocabulary spoken term detection. In: Proceedings of human language technologies: the 2009 annual conference of the North American Chapter of the Association for Computational Linguistics, companion, volume: short papers, Association for Computational Linguistics, pp 277\u2013280","DOI":"10.3115\/1620853.1620930"},{"key":"9739_CR109","doi-asserted-by":"crossref","unstructured":"Rastrow A, Sethy A, Ramabhadran B (2009) A new method for OOV detection using hybrid word\/fragment system. In: 2009 IEEE international conference on acoustics, speech and signal processing, IEEE, pp 3953\u20133956","DOI":"10.1109\/ICASSP.2009.4960493"},{"key":"9739_CR110","doi-asserted-by":"crossref","first-page":"373","DOI":"10.1016\/j.csl.2006.06.006","volume":"21","author":"B Roark","year":"2007","unstructured":"Roark B, Saraclar M, Collins M (2007) Discriminative n-gram language modeling. Comput Speech Lang 21:373\u2013392","journal-title":"Comput Speech Lang"},{"key":"9739_CR111","doi-asserted-by":"crossref","first-page":"309","DOI":"10.1006\/csla.1995.0015","volume":"9","author":"R Rose","year":"1995","unstructured":"Rose R (1995) Keyword detection in conversational speech utterances using hidden Markov model based continuous speech recognition. Comput Speech Lang 9:309\u2013333","journal-title":"Comput Speech Lang"},{"key":"9739_CR112","unstructured":"Rose RC, Paul DB (1990) A hidden Markov model based keyword recognition system. In: International conference on acoustics, speech, and signal processing (ICASSP-90), pp 129\u2013132"},{"key":"9739_CR113","unstructured":"Sainath TN, Parada C (2015) Convolutional neural networks for small-footprint keyword spotting. In: INTERSPEECH, pp 1478\u20131482"},{"key":"9739_CR114","doi-asserted-by":"crossref","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster M, Paliwal KK (1997) Bidirectional recurrent neural networks. IEEE Trans Signal Process 45:2673\u20132681","journal-title":"IEEE Trans Signal Process"},{"key":"9739_CR115","doi-asserted-by":"crossref","unstructured":"Seigel MS, Woodland PC, Gales M (2013) A confidence-based approach for improving keyword hypothesis scores. In: International conference on acoustics, speech and signal processing (ICASSP\u201913), pp 8565\u20138569","DOI":"10.1109\/ICASSP.2013.6639337"},{"key":"9739_CR116","doi-asserted-by":"crossref","unstructured":"Shan C, Zhang J, Wang Y, Xie L (2018) Attention-based end-to-end models for small-footprint keyword spotting. arXiv preprint arXiv:180310916","DOI":"10.21437\/Interspeech.2018-1777"},{"key":"9739_CR117","doi-asserted-by":"crossref","first-page":"1327","DOI":"10.1109\/JPROC.2003.817145","volume":"91","author":"R Sharma","year":"2003","unstructured":"Sharma R et al (2003) Speech-gesture driven multimodal interfaces for crisis management. Proc IEEE 91:1327\u20131354","journal-title":"Proc IEEE"},{"key":"9739_CR118","doi-asserted-by":"crossref","unstructured":"Shokri A, Tabibian S, Akbari A, Nasersharif B, Kabudian J (2011) A robust keyword spotting system for Persian conversational telephone speech using feature and score normalization and ARMA filter. In: GCC conference and exhibition (GCC), pp 497\u2013500","DOI":"10.1109\/IEEEGCC.2011.5752589"},{"key":"9739_CR119","volume-title":"1997 Mandarin broadcast news speech (HUB4-NE) LDC98S73","author":"JL Shudong Huang","year":"1998","unstructured":"Shudong Huang JL, Xuling Wu, Lei Wu, Yan Yongmin, Qin Zhoakai (1998) 1997 Mandarin broadcast news speech (HUB4-NE) LDC98S73. Linguistic Data Consortium, Philadelphia"},{"key":"9739_CR120","doi-asserted-by":"crossref","unstructured":"Sun M et al. (2016) Max-pooling loss training of long short-term memory networks for small-footprint keyword spotting. In: Spoken language technology workshop (SLT), IEEE, pp 474\u2013480","DOI":"10.1109\/SLT.2016.7846306"},{"key":"9739_CR121","doi-asserted-by":"crossref","unstructured":"Sun M, Schwarz A, Wu M, Strom N, Matsoukas S, Vitaladevuni S (2017a) An empirical study of cross-lingual transfer learning techniques for small-footprint keyword spotting. In: International conference on machine learning and applications (ICMLA), IEEE, pp 255\u2013260","DOI":"10.1109\/ICMLA.2017.0-150"},{"key":"9739_CR122","doi-asserted-by":"crossref","unstructured":"Sun M et al. (2017b) Compressed time delay neural network for small-footprint keyword spotting. In: INTERSPEECH, pp 3607\u20133611","DOI":"10.21437\/Interspeech.2017-480"},{"key":"9739_CR123","volume-title":"Hybrid word-subword spoken term detection","author":"I Sz\u00f6ke","year":"2010","unstructured":"Sz\u00f6ke I (2010) Hybrid word-subword spoken term detection. Faculty of Information Technology, BUT, Brno"},{"key":"9739_CR124","doi-asserted-by":"crossref","unstructured":"Sz\u00f6ke I, Schwarz P, Mat\u011bjka P, Burget L, Karafi\u00e1t M, \u010cernock\u00fd J (2005a) Phoneme based acoustics keyword spotting in informal continuous speech. In: International conference on text, speech and dialogue. Springer, pp 302\u2013309","DOI":"10.1007\/11551874_39"},{"key":"9739_CR125","doi-asserted-by":"crossref","unstructured":"Sz\u00f6ke I, Schwarz P, Matejka P, Burget L, Karafi\u00e1t M, Fapso M, Cernock\u00fd J (2005b) Comparison of keyword spotting approaches for informal continuous speech. In: Interspeech, Citeseer, pp 633\u2013636","DOI":"10.1007\/11551874_39"},{"key":"9739_CR126","doi-asserted-by":"crossref","first-page":"1018","DOI":"10.1016\/j.procs.2010.12.167","volume":"3","author":"S Tabibian","year":"2011","unstructured":"Tabibian S, Shokri A, Akbari A, Nasersharif B (2011) Performance evaluation for an HMM-based keyword spotter and a large-margin based one in noisy environments. Proc Comput Sci 3:1018\u20131022","journal-title":"Proc Comput Sci"},{"key":"9739_CR127","doi-asserted-by":"crossref","first-page":"1660","DOI":"10.1016\/j.engappai.2013.03.009","volume":"26","author":"S Tabibian","year":"2013","unstructured":"Tabibian S, Akbari A, Nasersharif B (2013) Keyword spotting using an evolutionary-based classifier and discriminative features. Eng Appl Artif Intell 26:1660\u20131670","journal-title":"Eng Appl Artif Intell"},{"key":"9739_CR128","doi-asserted-by":"crossref","first-page":"195","DOI":"10.1007\/s11063-013-9299-4","volume":"39","author":"S Tabibian","year":"2014","unstructured":"Tabibian S, Akbari A, Nasersharif B (2014) Extension of a kernel-based classifier for discriminative spoken keyword spotting. Neural Process Lett 39:195\u2013218","journal-title":"Neural Process Lett"},{"key":"9739_CR129","doi-asserted-by":"crossref","first-page":"184","DOI":"10.1016\/j.sigpro.2014.06.027","volume":"106","author":"S Tabibian","year":"2015","unstructured":"Tabibian S, Akbari A, Nasersharif B (2015) Speech enhancement using a wavelet thresholding method based on symmetric Kullback\u2013Leibler divergence. Sig Process 106:184\u2013197","journal-title":"Sig Process"},{"key":"9739_CR130","doi-asserted-by":"crossref","first-page":"45","DOI":"10.1016\/j.ins.2015.12.010","volume":"336","author":"S Tabibian","year":"2016","unstructured":"Tabibian S, Akbari A, Nasersharif B (2016) A fast hierarchical search algorithm for discriminative keyword spotting. Inf Sci 336:45\u201359","journal-title":"Inf Sci"},{"key":"9739_CR131","doi-asserted-by":"crossref","first-page":"157","DOI":"10.1016\/j.ins.2017.09.052","volume":"423","author":"S Tabibian","year":"2018","unstructured":"Tabibian S, Akbari A, Nasersharif B (2018) Discriminative keyword spotting using triphones Information and N-best Search. Inf Sci 423:157\u2013171","journal-title":"Inf Sci"},{"key":"9739_CR132","unstructured":"Tamura S, Iwano K, Furui S (2005) Toward robust multimodal speech recognition. In: Symposium on large scale knowledge resources (LKR2005), pp 163\u2013166"},{"key":"9739_CR133","doi-asserted-by":"crossref","unstructured":"Tang R, Lin J (2018) Deep residual learning for small-footprint keyword spotting. In: International conference on acoustics, speech and signal processing (ICASSP), IEEE, pp 5484\u20135488","DOI":"10.1109\/ICASSP.2018.8462688"},{"key":"9739_CR134","doi-asserted-by":"crossref","first-page":"980","DOI":"10.1016\/j.specom.2008.03.005","volume":"50","author":"J Tejedor","year":"2008","unstructured":"Tejedor J, Wang D, Frankel J, King S, Col\u00e1s J (2008) A comparison of grapheme and phoneme-based units for Spanish spoken term detection. Speech Commun 50:980\u2013991","journal-title":"Speech Commun"},{"key":"9739_CR135","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/s13636-017-0119-z","volume":"2017","author":"J Tejedor","year":"2017","unstructured":"Tejedor J et al (2017) ALBAYZIN 2016 spoken term detection evaluation: an international open competitive evaluation in Spanish. EURASIP J Audio Speech Music Process 2017:1\u201322","journal-title":"EURASIP J Audio Speech Music Process"},{"key":"9739_CR136","volume-title":"Acoustic keyword spotting in speech with applications to data mining","author":"AJ Thambiratnam","year":"2005","unstructured":"Thambiratnam AJ (2005) Acoustic keyword spotting in speech with applications to data mining. Queensland University of Technology, Brisbane"},{"key":"9739_CR137","unstructured":"Toh AM, Togneri R, Nordholm S (2005) Spectral entropy as speech features for speech recognition. Paper presented at the proceedings of post graduate electrical engineering and computing symposium (PEECS)"},{"key":"9739_CR138","doi-asserted-by":"crossref","first-page":"617","DOI":"10.1109\/TSA.2003.813579","volume":"11","author":"DT Toledano","year":"2003","unstructured":"Toledano DT, G\u00f3mez LAH, Grande LV (2003) Automatic phonetic segmentation. IEEE Trans Speech Audio Process 11:617\u2013625","journal-title":"IEEE Trans Speech Audio Process"},{"key":"9739_CR139","first-page":"1453","volume":"6","author":"I Tsochantaridis","year":"2005","unstructured":"Tsochantaridis I, Joachims T, Hofmann T, Altun Y (2005) Large margin methods for structured and interdependent output variables. J Mach Learn Res 6:1453\u20131484","journal-title":"J Mach Learn Res"},{"key":"9739_CR140","doi-asserted-by":"crossref","unstructured":"Tucker G, Wu M, Sun M, Panchapagesan S, Fu G, Vitaladevuni S (2016) Model compression applied to small-footprint keyword spotting. In: INTERSPEECH, pp 1878\u20131882","DOI":"10.21437\/Interspeech.2016-1393"},{"key":"9739_CR141","doi-asserted-by":"crossref","unstructured":"T\u00fcske Z, Golik P, Schl\u00fcter R, Drepper FR (2011) Non-stationary feature extraction for automatic speech recognition. In: International conference on acoustics, speech and signal processing (ICASSP\u201911), pp 5204\u20135207","DOI":"10.1109\/ICASSP.2011.5947530"},{"key":"9739_CR142","volume-title":"Statistical learning theory","author":"VN Vapnik","year":"1998","unstructured":"Vapnik VN, Vapnik V (1998) Statistical learning theory, vol 1. Wiley, New York"},{"key":"9739_CR143","doi-asserted-by":"crossref","DOI":"10.1002\/9780470740156","volume-title":"Advanced digital signal processing and noise reduction","author":"SV Vaseghi","year":"2008","unstructured":"Vaseghi SV (2008) Advanced digital signal processing and noise reduction. Wiley, London"},{"key":"9739_CR144","doi-asserted-by":"crossref","unstructured":"Viikki O, Bye D, Laurila K (1998) A recursive feature vector normalization approach for robust speech recognition in noise. In: International conference on acoustics, speech and signal processing (ICASSP\u201998), pp 733\u2013736","DOI":"10.1109\/ICASSP.1998.675369"},{"key":"9739_CR145","first-page":"378","volume":"5","author":"C Vimala","year":"2014","unstructured":"Vimala C, Radha V (2014) Suitable feature extraction and speech recognition technique for isolated tamil spoken words. Int J Comput Sci Inf Technol (IJCSIT) 5:378\u2013383","journal-title":"Int J Comput Sci Inf Technol (IJCSIT)"},{"key":"9739_CR146","volume-title":"Out-of-vocabulary spoken term detection","author":"D Wang","year":"2010","unstructured":"Wang D (2010) Out-of-vocabulary spoken term detection. University of Edinburgh, Edinburgh"},{"key":"9739_CR147","doi-asserted-by":"crossref","unstructured":"Wang D, Tejedor J, Frankel J, King S, Col\u00e1s\u00a0J (2009) Posterior-based confidence measures for spoken term detection. In: International conference on acoustics, speech and signal processing(ICASSP\u201909), pp 4889\u20134892","DOI":"10.1109\/ICASSP.2009.4960727"},{"key":"9739_CR148","doi-asserted-by":"crossref","first-page":"358","DOI":"10.1007\/s11390-012-1228-x","volume":"27","author":"D Wang","year":"2012","unstructured":"Wang D, Tejedor J, King S, Frankel J (2012) Term-dependent confidence normalisation for out-of-vocabulary spoken term detection. J Comput Sci Technol 27:358\u2013375","journal-title":"J Comput Sci Technol"},{"key":"9739_CR149","doi-asserted-by":"crossref","first-page":"483","DOI":"10.1002\/dac.2681","volume":"28","author":"Y Wang","year":"2015","unstructured":"Wang Y, Yang J, Lu J, Liu H, Wang L (2015) Hierarchical deep belief networks based point process model for keywords spotting in continuous speech. Int J Commun Syst 28:483\u2013496","journal-title":"Int J Commun Syst"},{"key":"9739_CR150","unstructured":"Wang Z, Li X, Zhou J (2017) Small-footprint keyword spotting using deep neural network and connectionist temporal classifier. arXiv preprint arXiv:170903665"},{"key":"9739_CR151","doi-asserted-by":"crossref","unstructured":"Weintraub M (1995) LVCSR log-likelihood ratio scoring for keyword spotting. In: International conference on acoustics, speech, and signal processing (ICASSP-95), pp 297\u2013300","DOI":"10.1109\/ICASSP.1995.479532"},{"key":"9739_CR152","doi-asserted-by":"crossref","unstructured":"Wolf JJ (1980) Speech signal processing and feature extraction. In: Spoken language generation and understanding. Springer, pp 103\u2013128","DOI":"10.1007\/978-94-009-9091-3_6"},{"key":"9739_CR153","doi-asserted-by":"crossref","unstructured":"Wollmer M, Eyben F, Keshet J, Graves A, Schuller B, Rigoll G (2009) Robust discriminative keyword spotting for emotionally colored spontaneous speech using bidirectional LSTM networks. In: International conference on acoustics, speech and signal processing (ICASSP\u201909), pp 3949\u20133952","DOI":"10.1109\/ICASSP.2009.4960492"},{"key":"9739_CR154","doi-asserted-by":"crossref","first-page":"253","DOI":"10.1007\/s11571-011-9166-9","volume":"5","author":"M Wollmer","year":"2011","unstructured":"Wollmer M, Marchi E, Squartini S, Schuller B (2011) Multi-stream LSTM-HMM decoding and histogram equalization for noise robust keyword spotting. Cognit Neurodyn 5:253\u2013264","journal-title":"Cognit Neurodyn"},{"key":"9739_CR155","doi-asserted-by":"crossref","first-page":"252","DOI":"10.1016\/j.specom.2012.08.006","volume":"55","author":"M Wollmer","year":"2013","unstructured":"Wollmer M, Schuller B, Rigoll G (2013) Keyword spotting exploiting long short-term memory. Speech Commun 55:252\u2013265","journal-title":"Speech Commun"},{"key":"9739_CR156","doi-asserted-by":"crossref","unstructured":"Wu M et al. (2018) Monophone-based background modeling for two-stage on-device wake word detection. In: International conference on acoustics, speech and signal processing (ICASSP), IEEE, pp 5494\u20135498","DOI":"10.1109\/ICASSP.2018.8462227"},{"key":"9739_CR157","unstructured":"Xiong X (2009) Robust speech features and acoustic models for speech recognition. Nanyang Technological University, Ph.D. Thesis"},{"key":"9739_CR158","doi-asserted-by":"crossref","first-page":"1026","DOI":"10.1016\/j.patcog.2005.10.029","volume":"39","author":"Y Xu","year":"2006","unstructured":"Xu Y, Zhang D, Jin Z, Li M, Yang J-Y (2006) A fast kernel-based nonlinear discriminant analysis for multi-class problems. Pattern Recogn 39:1026\u20131033","journal-title":"Pattern Recogn"},{"key":"9739_CR159","doi-asserted-by":"crossref","unstructured":"Xu H, Su H, Chng ES, Li H (2014) Semi-supervised training for bottle-neck feature based DNN-HMM hybrid systems. In: Fifteenth annual conference of the international speech communication association, pp 2078\u20132082","DOI":"10.21437\/Interspeech.2014-472"},{"key":"9739_CR160","first-page":"415","volume":"56","author":"J Yang","year":"2004","unstructured":"Yang J, Frangi AF (2004) Yang J-y. A new kernel Fisher discriminant algorithm with application to face recognition Neurocomputing 56:415\u2013421","journal-title":"A new kernel Fisher discriminant algorithm with application to face recognition Neurocomputing"},{"key":"9739_CR161","volume-title":"Garbage modeling techniques for a Turkish keyword spotting system","author":"\u00dc Yapanel","year":"2000","unstructured":"Yapanel \u00dc (2000) Garbage modeling techniques for a Turkish keyword spotting system. Bo\u011fazi\u00e7i University, Istanbul"},{"key":"9739_CR162","doi-asserted-by":"crossref","unstructured":"Yoshizawa S, Hayasaka N, Wada N, Miyanaga Y (2004) Cepstral gain normalization for noise robust speech recognition. In: International conference on acoustics, speech, and signal processing (ICASSP\u201904), pp I-209\u2013I-212","DOI":"10.1109\/ICASSP.2004.1325959"},{"key":"9739_CR163","volume-title":"Automatic speech recognition: a deep learning approach","author":"D Yu","year":"2014","unstructured":"Yu D, Deng L (2014) Automatic speech recognition: a deep learning approach. Springer, New York"},{"key":"9739_CR164","volume-title":"Keyword spotting on word lattices","author":"DG Zacharie","year":"2007","unstructured":"Zacharie DG, Pinto JP (2007) Keyword spotting on word lattices. IDIAP, Martigny"},{"key":"9739_CR165","doi-asserted-by":"crossref","unstructured":"Zhang S-X (2014) Structured support vector machines for speech recognition. University of Cambridge, Department of Engineering, Doctor of Philosophy thesis","DOI":"10.1109\/ICASSP.2014.6854215"},{"key":"9739_CR166","doi-asserted-by":"crossref","unstructured":"Zhang Y, Glass JR (2009) Unsupervised spoken keyword spotting via segmental DTW on Gaussian posteriorgrams. In: IEEE workshop on automatic speech recognition & understanding, IEEE, pp 398\u2013403","DOI":"10.1109\/ASRU.2009.5372931"},{"key":"9739_CR167","doi-asserted-by":"crossref","first-page":"151","DOI":"10.1007\/s11063-011-9170-4","volume":"33","author":"R Zhang","year":"2011","unstructured":"Zhang R, Wang W (2011) Learning linear and nonlinear PCA with linear programming. Neural Process Lett 33:151\u2013170","journal-title":"Neural Process Lett"},{"key":"9739_CR168","unstructured":"Zhang S-X, Liu C, Yao K, Gong Y (2015) Deep neural support vector machines for speech recognition. In: International conference on acoustics, speech and signal processing (ICASSP\u201915), pp 4275\u20134279"},{"key":"9739_CR169","unstructured":"Zhang Y, Suda N, Lai L, Chandra V (2017) Hello edge: keyword spotting on microcontrollers. arXiv preprint arXiv:171107128"},{"key":"9739_CR170","first-page":"393","volume":"6","author":"H Zhao","year":"2012","unstructured":"Zhao H, Xiao Y (2012) A novel robust MFCC extraction method using sample-ISOMAP for speech recognition. Int J Digit Content Technol Appl 6:393\u2013400","journal-title":"Int J Digit Content Technol Appl"},{"key":"9739_CR171","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1007\/s11063-004-0036-x","volume":"22","author":"W Zheng","year":"2005","unstructured":"Zheng W, Zou C, Zhao L (2005) An improved algorithm for kernel principal component analysis. Neural Process Lett 22:49\u201356","journal-title":"Neural Process Lett"},{"key":"9739_CR172","doi-asserted-by":"crossref","unstructured":"Zhuang Y, Chang X, Qian Y, Yu K (2016) Unrestricted vocabulary keyword spotting using LSTM-CTC. In: INTERSPEECH, pp 938\u2013942","DOI":"10.21437\/Interspeech.2016-753"}],"container-title":["Artificial Intelligence Review"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-019-09739-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10462-019-09739-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10462-019-09739-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,21]],"date-time":"2024-07-21T12:21:20Z","timestamp":1721564480000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10462-019-09739-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,7,27]]},"references-count":172,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2020,4]]}},"alternative-id":["9739"],"URL":"https:\/\/doi.org\/10.1007\/s10462-019-09739-y","relation":{},"ISSN":["0269-2821","1573-7462"],"issn-type":[{"value":"0269-2821","type":"print"},{"value":"1573-7462","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,7,27]]},"assertion":[{"value":"27 July 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}