{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,2]],"date-time":"2025-08-02T05:27:17Z","timestamp":1754112437999,"version":"3.37.3"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2019,11,11]],"date-time":"2019-11-11T00:00:00Z","timestamp":1573430400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,11,11]],"date-time":"2019-11-11T00:00:00Z","timestamp":1573430400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Sign Process Syst"],"published-print":{"date-parts":[[2020,8]]},"DOI":"10.1007\/s11265-019-01484-3","type":"journal-article","created":{"date-parts":[[2019,11,11]],"date-time":"2019-11-11T05:04:12Z","timestamp":1573448652000},"page":"805-817","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Spoken Language Understanding of Human-Machine Conversations for Language Learning Applications"],"prefix":"10.1007","volume":"92","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1855-9630","authenticated-orcid":false,"given":"Yao","family":"Qian","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rutuja","family":"Ubale","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Patrick","family":"Lange","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Keelan","family":"Evanini","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Vikram","family":"Ramanarayanan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Frank K.","family":"Soong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,11,11]]},"reference":[{"issue":"3","key":"1484_CR1","doi-asserted-by":"publisher","first-page":"530","DOI":"10.1109\/TASLP.2014.2383614","volume":"23","author":"G Mesnil","year":"2015","unstructured":"Mesnil, G., Dauphin, Y., Yao, K., Bengio, Y., Deng, L., Hakkani- Tur, D., He, X., Heck, L., Tur, D.Y.G., Zweig, G. (2015). Using recurrent neural networks for slot filling in spoken language understanding. IEEE\/ACM Transactions on Audio, Speech and Language Processing, 23(3), 530\u2013539.","journal-title":"IEEE\/ACM Transactions on Audio, Speech and Language Processing"},{"key":"1484_CR2","doi-asserted-by":"crossref","unstructured":"Xu, P., & Sarikaya, R. (2013). Convolutional neural network based triangular CRF for joint intent detection and slot filling. In Proc. of ASRU (pp. 78\u201383).","DOI":"10.1109\/ASRU.2013.6707709"},{"key":"1484_CR3","doi-asserted-by":"crossref","unstructured":"Tur, G., Hakkani-Tur, D., Heck, L., Parthasarathy, S. (2011). Sentence simplification for spoken language understanding. In Proc. of ICASSP (pp. 5628\u20135631).","DOI":"10.1109\/ICASSP.2011.5947636"},{"issue":"3","key":"1484_CR4","doi-asserted-by":"publisher","first-page":"374","DOI":"10.1016\/j.specom.2005.06.001","volume":"48","author":"Q Huang","year":"2006","unstructured":"Huang, Q., & Cox, S. (2006). Task-independent call-routing. Speech Communication, 48(3), 374\u2013389.","journal-title":"Speech Communication"},{"key":"1484_CR5","unstructured":"Gorin, A.L., Petrovska-Delacretaz, D., Riccardi, G., Wright, J.H. (1999). Learning spoken language without transcriptions. In Proc. of ASRU (Vol. 99)."},{"key":"1484_CR6","doi-asserted-by":"crossref","unstructured":"Alshawi, H. (2003). Effective utterance classification with unsupervised phonotactic models. In Proc. of NAACL HLT, (Vol. 1 pp. 1\u20137).","DOI":"10.3115\/1073445.1073446"},{"key":"1484_CR7","unstructured":"Wang, Y.Y., Lee, J., Acero, A. (2006). Speech utterance classification model training without manual transcriptions. In Proc. of ICASSP, (Vol. 1 pp. 553\u2013556)."},{"key":"1484_CR8","doi-asserted-by":"crossref","unstructured":"Wang, Y., Skerry-Ryan, R.J., Stanton, D., Wu, Y., Weiss, R.J., Jaitly, N., Yang, Z., Xiao, Y., Chen, Z., Bengio, S., Le, Q. (2017). Tacotron: Towards end-to-end speech synthesis. In Proc. Interspeechpp 4006\u20134010.","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"1484_CR9","doi-asserted-by":"crossref","unstructured":"Shen, J., Pang, R., Weiss, R.J., Schuster, M., Jaitly, N., Yang, Z., Chen, Z., Zhang, Y., Wang, Y., Skerrv-Ryan, R., Saurous, R.A. (2018). Natural tts synthesis by conditioning wavenet on mel spectrogram predictions. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4779\u20134783). IEEE.","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"1484_CR10","unstructured":"Sotelo, J., Mehri, S., Kumar, K., Santos, J.F., Kastner, K., Courville, A., Bengio, Y. (2017). Char2wav: End-to-end speech synthesis. In ICLR 2017 workshop."},{"key":"1484_CR11","doi-asserted-by":"crossref","unstructured":"Heigold, G., Moreno, I., Bengio, S., Shazeer, N. (2016). End-to-end text-dependent speaker verification. In 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 5115\u20135119). IEEE.","DOI":"10.1109\/ICASSP.2016.7472652"},{"key":"1484_CR12","unstructured":"Zhang, S.X., Chen, Z., Zhao, Y., Li, J., Gong, Y. (2016). End-to-end attention based text-dependent speaker verification. In: Proceedings of the IEEE Workshop on Spoken Language Technology (SLT 2016) (pp. 171\u2013178). IEEE."},{"key":"1484_CR13","doi-asserted-by":"crossref","unstructured":"Ubale, R., Qian, Y., Evanini, K. (2018). Exploring end-to-end attention-based neural networks for native language identification. In Proceedings of the 2018 IEEE Spoken Language Technology Workshop (SLT) (pp. 84\u201391). IEEE.","DOI":"10.1109\/SLT.2018.8639689"},{"key":"1484_CR14","unstructured":"Geng, W., Wang, W., Zhao, Y., Cai, X., Xu, B. (2016). End-to-End Language Identification Using Attention-Based Recurrent Neural Networks. In: Proc. INTERSPEECH (pp. 2944\u20132948)."},{"key":"1484_CR15","unstructured":"Trigeorgis, G., Ringeval, F., Brueckner, R., Marchi, E., Nicolaou, M.A., Schuller, B., Zafeiriou, S. (2016). Adieu features? end-to-end speech emotion recognition using a deep convolutional recurrent network. In 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 5200\u20135204). IEEE. https:\/\/www.overleaf.com\/project\/5d41b311fb69574cddcacef7."},{"issue":"8","key":"1484_CR16","doi-asserted-by":"publisher","first-page":"1351","DOI":"10.1109\/JSTSP.2017.2759726","volume":"11","author":"K Audhkhasi","year":"2017","unstructured":"Audhkhasi, K., Rosenberg, A., Sethy, A., Ramabhadran, B., Kingsbury, B. (2017). End-to-end ASR-free keyword search from speech. IEEE Journal of Selected Topics in Signal Processing, 11(8), 1351\u20131359. IEEE.","journal-title":"IEEE Journal of Selected Topics in Signal Processing"},{"key":"1484_CR17","unstructured":"Lengerich, C., & Hannun, A. (2016). An end-to-end architecture for keyword spotting and voice activity detection. arXiv:1611.09405."},{"key":"1484_CR18","doi-asserted-by":"crossref","unstructured":"Li, B., Sainath, T.N., Sim, K.C., Bacchiani, M., Weinstein, E., Nguyen, P., Chen, Z., Wu, Y., Rao, K. (2018). Multi-dialect speech recognition with a single sequence-to-sequence model. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4749\u20134753). IEEE.","DOI":"10.1109\/ICASSP.2018.8461886"},{"key":"1484_CR19","doi-asserted-by":"crossref","unstructured":"Toshniwal, S., Sainath, T.N., Weiss, R.J., Li, B., Moreno, P., Weinstein E., Rao, K. (2018). Multilingual speech recognition with a single end-to-end model. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4904\u20134908). IEEE.","DOI":"10.1109\/ICASSP.2018.8461972"},{"key":"1484_CR20","unstructured":"Chan, W., Jaitly, N., Le, Q., Vinyals, O. (2016). Listen, attend and spell: A neural network for large vocabulary conversational speech recognition, In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4960\u20134964). IEEE."},{"key":"1484_CR21","doi-asserted-by":"crossref","unstructured":"Prabhavalkar, R., Rao, K., Sainath, T.N., Li, B., Johnson, L., Jaitly, N. (2017). A comparison of sequence-to-sequence models for speech recognition. In Proc. Interspeech (pp. 939\u2013943).","DOI":"10.21437\/Interspeech.2017-233"},{"key":"1484_CR22","doi-asserted-by":"crossref","unstructured":"Chiu, C.C., Sainath, T.N., Wu, Y., Prabhavalkar, R., Nguyen, P., Chen, Z., Kannan, A., Weiss, R.J., Rao, K., Gonina, E., Jaitly, N. (2018). State-of-the-art speech recognition with sequence-to-sequence models. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4774\u20134778) IEEE.","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"1484_CR23","doi-asserted-by":"crossref","unstructured":"Prabhavalkar, R., Sainath, T.N., Wu, Y., Nguyen, P., Chen, Z., Chiu, C.C., Kannan, A. (2018). Minimum word error rate training for attention-based sequence-to-sequence models. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4839\u20134843). IEEE.","DOI":"10.1109\/ICASSP.2018.8461809"},{"key":"1484_CR24","doi-asserted-by":"crossref","unstructured":"Sainath, T.N., Chiu, C.C., Prabhavalkar, R., Kannan, A., Wu, Y., Nguyen, P., Chen, Z. (2018). Improving the performance of online neural transducer models. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 5864\u20135868). IEEE.","DOI":"10.1109\/ICASSP.2018.8462366"},{"key":"1484_CR25","doi-asserted-by":"crossref","unstructured":"Qian, Y., Ubale, R., Ramanaryanan, V., Lange, P., Suendermann-Oeft, D., Evanini, K., Tsuprun, E. (2017). Exploring ASR-free end-to-end modeling to improve spoken language understanding in a cloud-based dialog system. In 2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU) (pp. 569\u2013576). IEEE.","DOI":"10.1109\/ASRU.2017.8268987"},{"key":"1484_CR26","doi-asserted-by":"crossref","unstructured":"Qian, Y., Ubale, R., Lange, P., Evanini, K., Soong, F. (2018). From speech signals to semantics - tagging performance at acoustic, phonetic and word levels. In 2018 11th International Symposium on Chinese Spoken Language Processing (ISCSLP). IEEE.","DOI":"10.1109\/ISCSLP.2018.8706581"},{"key":"1484_CR27","unstructured":"Serdyuk, D., Wang, Y., Fuegen, C., Kumar, A., Liu, B., Bengio, Y. (2018). Towards end-to-end spoken language understanding. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 5754\u20135758). IEEE."},{"key":"1484_CR28","unstructured":"Haghani, P., Narayanan, A., Bacchiani, M., Chuang, G., Gaur, N., Moreno, P., Prabhavalkar, R., Qu, Z., Waters, A. (2018). From Audio to Semantics:, Approaches to end-to-end spoken language understanding. arXiv:1809.09190."},{"key":"1484_CR29","doi-asserted-by":"publisher","unstructured":"Ramanarayanan, V., Suendermann-Oeft, D., Lange, P., Ivanov, A.V., Evanini, K., Yu, Z., Tsuprun, E., Qian, Y. (2016). Bootstrapping development of a Cloud-Based spoken dialog system in the educational domain from scratch using crowdsourced data. ETS Research Report Series, Wiley, https:\/\/doi.org\/10.1002\/ets2.12105.","DOI":"10.1002\/ets2.12105"},{"key":"1484_CR30","first-page":"295","volume-title":"Assembling the Jigsaw: How Multiple Open Standards Are Synergistically Combined in the HALEF Multimodal Dialog System. Multimodal Interaction with W3C Standards","author":"V Ramanarayanan","year":"2017","unstructured":"Ramanarayanan, V., Suendermann-Oeft, D., Lange, P., Mundkowsky, R., Ivanov, A., Yu, Z., Qian, Y., Evanini, K. (2017). Assembling the Jigsaw: How Multiple Open Standards Are Synergistically Combined in the HALEF Multimodal Dialog System. Multimodal Interaction with W3C Standards, (pp. 295\u2013310). Berlin: Springer."},{"key":"1484_CR31","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.specom.2015.07.006","volume":"73","author":"J Cheng","year":"2015","unstructured":"Cheng, J., Chen, X., Metallinou, A. (2015). Deep neural network acoustic models for spoken assessment applications. Speech Communication, 73, 14\u201327.","journal-title":"Speech Communication"},{"issue":"8","key":"1484_CR32","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"1484_CR33","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q.V., Vinyals, O. (2016). Listen, attend and spell: A neural network for large vocabulary conversational speech recognition. In Proc. of ICASSP.","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"1484_CR34","doi-asserted-by":"crossref","unstructured":"Audhkhasi, K., Rosenberg, A., Sethy, A., Ramabhadran, B., Kingsbury, B. (2017). End-to-end ASR-free keyword search from speech. In Proc. of ICASSP.","DOI":"10.1109\/ICASSP.2017.7953076"},{"key":"1484_CR35","doi-asserted-by":"crossref","unstructured":"Chung, Y., Wu, C., Shen, C., Lee, H., Lee, L. (2016). Audio Word2Vec: unsupervised learning of audio segment representations using sequence-to-sequence autoencoder. In Proc. of Interspeech.","DOI":"10.21437\/Interspeech.2016-82"},{"issue":"8","key":"1484_CR36","doi-asserted-by":"publisher","first-page":"1798","DOI":"10.1109\/TPAMI.2013.50","volume":"35","author":"Y Bengio","year":"2013","unstructured":"Bengio, Y., Courville, A., Vincent, P. (2013). Representation learning: a review and new perspectives. IEEE Transactions on Pattern Analysis and Machine Intelligence, 35(8), 1798\u20131828.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1484_CR37","doi-asserted-by":"crossref","unstructured":"Siohan, O., & Bacchiani, M. (2005). Fast vocabulary-independent audio search using path-based graph indexing. In Proc. of Interspeech.","DOI":"10.21437\/Interspeech.2005-52"},{"key":"1484_CR38","unstructured":"Povey, D., Ghoshal, A., Boulianne, G., Burget, L., Glembek, O., Goel, N., Hannemann, M., Motlicek, P., Qian, Y., Schwarz, P., Silovsky, J., Stemmer, G., Vesely, K. (2011). The kaldi speech recognition toolkit. In Proc. of ASRU."},{"key":"1484_CR39","unstructured":"Cieri, J., Miller, D., Walker, K. (2004). The fisher corpus: a resource for the next generations of speech-to-text. In LREC, (Vol. 4 pp. 69\u201371)."},{"key":"1484_CR40","doi-asserted-by":"crossref","unstructured":"Peddinti, V., Povey, D., Khudanpur, S. (2015). A time delay neural network architecture for efficient modeling of long temporal contexts. In Proc. of INTERSPEECH (pp. 3214\u20133218).","DOI":"10.21437\/Interspeech.2015-647"},{"key":"1484_CR41","doi-asserted-by":"crossref","unstructured":"Qian, Y., Wang, X., Evanini, K., Suendermann- Oeft, D. (2016). Self-adaptive dnn for improving spoken language proficiency assessment. In Proc. of Interspeech (pp. 3122\u20133126).","DOI":"10.21437\/Interspeech.2016-291"},{"issue":"2","key":"1484_CR42","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1007\/s10772-012-9171-3","volume":"16","author":"E Tetariy","year":"2013","unstructured":"Tetariy, E., Gishri, M., Har-Lev, B., Aharonson, V., Moyal, A. (2013). An efficient lattice-based phonetic search method for accelerating keyword spotting in large speech databases. International Journal of Speech Technology, 16(2), 161\u2013169.","journal-title":"International Journal of Speech Technology"},{"key":"1484_CR43","unstructured":"Saraclar, M., & Sproat, R. (2004). Lattice-based search for spoken utterance retrieval. In Proc. of ACL (pp. 129\u2013136)."}],"container-title":["Journal of Signal Processing Systems"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11265-019-01484-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11265-019-01484-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11265-019-01484-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,4]],"date-time":"2022-10-04T07:48:39Z","timestamp":1664869719000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11265-019-01484-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,11,11]]},"references-count":43,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2020,8]]}},"alternative-id":["1484"],"URL":"https:\/\/doi.org\/10.1007\/s11265-019-01484-3","relation":{},"ISSN":["1939-8018","1939-8115"],"issn-type":[{"type":"print","value":"1939-8018"},{"type":"electronic","value":"1939-8115"}],"subject":[],"published":{"date-parts":[[2019,11,11]]},"assertion":[{"value":"15 February 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 August 2019","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 September 2019","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2019","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}