{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T16:45:47Z","timestamp":1779381947023,"version":"3.53.1"},"reference-count":105,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2024,2,16]],"date-time":"2024-02-16T00:00:00Z","timestamp":1708041600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,2,16]],"date-time":"2024-02-16T00:00:00Z","timestamp":1708041600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100003452","name":"Innovation and Technology Commission","doi-asserted-by":"crossref","award":["PRP\/026\/21FX"],"award-info":[{"award-number":["PRP\/026\/21FX"]}],"id":[{"id":"10.13039\/501100003452","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["The VLDB Journal"],"published-print":{"date-parts":[[2024,7]]},"DOI":"10.1007\/s00778-024-00837-0","type":"journal-article","created":{"date-parts":[[2024,2,16]],"date-time":"2024-02-16T10:02:32Z","timestamp":1708077752000},"page":"1179-1201","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Speech-to-SQL: toward speech-driven SQL query generation from natural language question"],"prefix":"10.1007","volume":"33","author":[{"given":"Yuanfeng","family":"Song","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Raymond Chi-Wing","family":"Wong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7154-9338","authenticated-orcid":false,"given":"Xuefang","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,2,16]]},"reference":[{"key":"837_CR1","unstructured":"Serenade ai. (Last accessed 16 Oct. 2022). https:\/\/serenade.ai\/"},{"key":"837_CR2","unstructured":"Talon voice. (Last accessed 16 Oct. 2022). https:\/\/talonvoice.com\/"},{"issue":"5","key":"837_CR3","doi-asserted-by":"publisher","first-page":"793","DOI":"10.1007\/s00778-019-00567-8","volume":"28","author":"K Affolter","year":"2019","unstructured":"Affolter, K., Stockinger, K., Bernstein, A.: A comparative survey of recent natural language interfaces for databases. VLDB J. 28(5), 793\u2013819 (2019)","journal-title":"VLDB J."},{"key":"837_CR4","doi-asserted-by":"crossref","unstructured":"Alateeq, A., Roantree, M., Gurrin, C.: Voxento: A prototype voice-controlled interactive search engine for lifelogs. In: Proceedings of the Third Annual Workshop on Lifelog Search Challenge, pp. 77\u201381 (2020)","DOI":"10.1145\/3379172.3391728"},{"issue":"8","key":"837_CR5","doi-asserted-by":"publisher","first-page":"1351","DOI":"10.1109\/JSTSP.2017.2759726","volume":"11","author":"K Audhkhasi","year":"2017","unstructured":"Audhkhasi, K., Rosenberg, A., Sethy, A., Ramabhadran, B., Kingsbury, B.: End-to-end asr-free keyword search from speech. IEEE J. Selected Top. Signal Process. 11(8), 1351\u20131359 (2017)","journal-title":"IEEE J. Selected Top. Signal Process."},{"key":"837_CR6","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. In: NIPS 2016 Deep Learning Symposium (2016)"},{"key":"837_CR7","doi-asserted-by":"crossref","unstructured":"Bahdanau, D., Chorowski, J., Serdyuk, D., Brakel, P., Bengio, Y.: End-to-end attention-based large vocabulary speech recognition. In: 2016 ICASSP, pp. 4945\u20134949. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"837_CR8","doi-asserted-by":"crossref","unstructured":"Bansal, S., Kamper, H., Lopez, A., Goldwater, S.: Towards speech-to-text translation without speech recognition. In: Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers, pp. 474\u2013479 (2017)","DOI":"10.18653\/v1\/E17-2076"},{"key":"837_CR9","doi-asserted-by":"crossref","unstructured":"Black, D., Rapos, E.J., Stephan, M.: Voice-driven modeling: Software modeling using automated speech recognition. In: 2019 ACM\/IEEE 22nd International Conference on Model Driven Engineering Languages and Systems Companion (MODELS-C), pp. 252\u2013258. IEEE (2019)","DOI":"10.1109\/MODELS-C.2019.00040"},{"key":"837_CR10","doi-asserted-by":"crossref","unstructured":"Blunschi, L., Jossen, C., Kossmann, D., Mori, M., Stockinger, K.: Data-thirsty business analysts need soda: search over data warehouse. In: Proceedings of the 20th ACM international conference on Information and knowledge management, pp. 2525\u20132528 (2011)","DOI":"10.1145\/2063576.2064009"},{"key":"837_CR11","doi-asserted-by":"crossref","unstructured":"Bogin, B., Berant, J., Gardner, M.: Representing schema structure with graph neural networks for text-to-sql parsing. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 4560\u20134565 (2019)","DOI":"10.18653\/v1\/P19-1448"},{"key":"837_CR12","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al.: Language models are few-shot learners. Adv. Neural Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"837_CR13","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q., Vinyals, O.: Listen, attend and spell: a neural network for large vocabulary conversational speech recognition. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4960\u20134964. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"837_CR14","doi-asserted-by":"crossref","unstructured":"Chazan, D., Hoory, R., Cohen, G., Zibulski, M.: Speech reconstruction from mel frequency cepstral coefficients and pitch frequency. In: 2000 IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings (Cat. No. 00CH37100), vol.\u00a03, pp. 1299\u20131302. IEEE (2000)","DOI":"10.1109\/ICASSP.2000.861816"},{"key":"837_CR15","doi-asserted-by":"crossref","unstructured":"Chen, F., Hwang, S.w., Choo, J., Ha, J.W., Kim, S.: Nl2psql: Generating pseudo-sql queries from under-specified natural language questions. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 2603\u20132613 (2019)","DOI":"10.18653\/v1\/D19-1262"},{"key":"837_CR16","doi-asserted-by":"crossref","unstructured":"Chen, T., Wong, R.C.W.: Handling information loss of graph neural networks for session-based recommendation. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1172\u20131180 (2020)","DOI":"10.1145\/3394486.3403170"},{"key":"837_CR17","doi-asserted-by":"crossref","unstructured":"Cho, K., van Merrienboer, B., G\u00fcl\u00e7ehre, \u00c7., Bahdanau, D., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using rnn encoder-decoder for statistical machine translation. In: Conference on Empirical Methods in Natural Language Processing (EMNLP 2014) (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"837_CR18","doi-asserted-by":"crossref","unstructured":"Currey, A., Heafield, K.: Incorporating source syntax into transformer-based neural machine translation. In: Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers), pp. 24\u201333 (2019)","DOI":"10.18653\/v1\/W19-5203"},{"key":"837_CR19","doi-asserted-by":"crossref","unstructured":"D\u00e9silets, A., Fox, D.C., Norton, S.: Voicecode: An innovative speech interface for programming-by-voice. In: CHI\u201906 Extended Abstracts on Human Factors in Computing Systems, pp. 239\u2013242 (2006)","DOI":"10.1145\/1125451.1125502"},{"key":"837_CR20","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) (2019)"},{"key":"837_CR21","doi-asserted-by":"crossref","unstructured":"Du, Z., Qian, Y., Liu, X., Ding, M., Qiu, J., Yang, Z., Tang, J.: Glm: General language model pretraining with autoregressive blank infilling. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 320\u2013335 (2022)","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"837_CR22","doi-asserted-by":"crossref","unstructured":"Fan, Y., Qian, Y., Xie, F.L., Soong, F.K.: Tts synthesis with bidirectional lstm based recurrent neural networks. In: Fifteenth Annual Conference of the International Speech Communication Association (2014)","DOI":"10.21437\/Interspeech.2014-443"},{"key":"837_CR23","doi-asserted-by":"crossref","unstructured":"Foote, J.T.: Content-based retrieval of music and audio. In: Multimedia Storage and Archiving Systems II, vol. 3229, pp. 138\u2013147. International Society for Optics and Photonics (1997)","DOI":"10.1117\/12.290336"},{"key":"837_CR24","doi-asserted-by":"crossref","unstructured":"Gan, Y., Chen, X., Huang, Q., Purver, M., Woodward, J.R., Xie, J., Huang, P.: Towards robustness of text-to-sql models against synonym substitution. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 2505\u20132515 (2021)","DOI":"10.18653\/v1\/2021.acl-long.195"},{"key":"837_CR25","doi-asserted-by":"crossref","unstructured":"Gkini, O., Belmpas, T., Koutrika, G., Ioannidis, Y.: An in-depth benchmarking of text-to-sql systems. In: Proceedings of the 2021 International Conference on Management of Data, pp. 632\u2013644 (2021)","DOI":"10.1145\/3448016.3452836"},{"key":"837_CR26","unstructured":"Glorot, X., Bordes, A., Bengio, Y.: Deep sparse rectifier neural networks. In: Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics, pp. 315\u2013323. JMLR Workshop and Conference Proceedings (2011)"},{"key":"837_CR27","doi-asserted-by":"crossref","unstructured":"Graves, A.: Long short-term memory. In: Supervised Sequence Labelling with Recurrent Neural Networks, pp. 37\u201345. Springer (2012)","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"837_CR28","doi-asserted-by":"crossref","unstructured":"Guo, J., Zhan, Z., Gao, Y., Xiao, Y., Lou, J.G., Liu, T., Zhang, D.: Towards complex text-to-sql in cross-domain database with intermediate representation. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 4524\u20134535 (2019)","DOI":"10.18653\/v1\/P19-1444"},{"key":"837_CR29","doi-asserted-by":"crossref","unstructured":"Hernandez, F., Nguyen, V., Ghannay, S., Tomashenko, N., Est\u00e8ve, Y.: Ted-lium 3: twice as much data and corpus repartition for experiments on speaker adaptation. In: International Conference on Speech and Computer, pp. 198\u2013208. Springer (2018)","DOI":"10.1007\/978-3-319-99579-3_21"},{"key":"837_CR30","doi-asserted-by":"crossref","unstructured":"Herzig, J., Nowak, P.K., Mueller, T., Piccinno, F., Eisenschlos, J.: Tapas: Weakly supervised table parsing via pre-training. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 4320\u20134333 (2020)","DOI":"10.18653\/v1\/2020.acl-main.398"},{"key":"837_CR31","doi-asserted-by":"crossref","unstructured":"Iacob, R.C.A., Brad, F., Apostol, E.S., Truic\u0103, C.O., Hosu, I.A., Rebedea, T.: Neural approaches for natural language interfaces to databases: a survey. In: Proceedings of the 28th International Conference on Computational Linguistics, pp. 381\u2013395 (2020)","DOI":"10.18653\/v1\/2020.coling-main.34"},{"key":"837_CR32","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: Accelerating deep network training by reducing internal covariate shift. In: International Conference on Machine Learning, pp. 448\u2013456. PMLR (2015)"},{"key":"837_CR33","unstructured":"Kedar, S.: Database Management System. Technical Publications (2009)"},{"issue":"10","key":"837_CR34","doi-asserted-by":"publisher","first-page":"1737","DOI":"10.14778\/3401960.3401970","volume":"13","author":"H Kim","year":"2020","unstructured":"Kim, H., So, B.H., Han, W.S., Lee, H.: Natural language to sql: Where are we today? Proceedings of the VLDB Endowment 13(10), 1737\u20131750 (2020)","journal-title":"Proceedings of the VLDB Endowment"},{"key":"837_CR35","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. Tech. rep. (2014)"},{"key":"837_CR36","unstructured":"Kipf, T.N., Welling, M.: Semi-supervised classification with graph convolutional networks. In: 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24\u201326, 2017, Conference Track Proceedings (2017)"},{"key":"837_CR37","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Adv. Neural Inf. Process. Syst. 25, 1097\u20131105 (2012)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"837_CR38","unstructured":"Kumar, K., Kumar, R., de\u00a0Boissiere, T., Gestin, L., Teoh, W.Z., Sotelo, J., de\u00a0Br\u00e9bisson, A., Bengio, Y., Courville, A.C.: Melgan: Generative adversarial networks for conditional waveform synthesis. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"837_CR39","unstructured":"Lakew, S.M., Cettolo, M., Federico, M.: A comparison of transformer and recurrent neural networks on multilingual neural machine translation. In: Proceedings of the 27th International Conference on Computational Linguistics, pp. 641\u2013652 (2018)"},{"key":"837_CR40","doi-asserted-by":"crossref","unstructured":"Le, H., Sahoo, D., Chen, N., Hoi, S.: Multimodal transformer networks for end-to-end video-grounded dialogue systems. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 5612\u20135623 (2019)","DOI":"10.18653\/v1\/P19-1564"},{"key":"837_CR41","unstructured":"Lee, H., Fenwick\u00a0Jr, J.B., Klima, R.E., McRae, A.A., Vahlbusch, J.: Disability assistive programming: using voice input to write code. Ph.D. thesis, Appalachian State University (2019)"},{"key":"837_CR42","doi-asserted-by":"crossref","unstructured":"Lei, W., Wang, W., Ma, Z., Gan, T., Lu, W., Kan, M.Y., Chua, T.S.: Re-examining the role of schema linking in text-to-sql. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 6943\u20136954 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.564"},{"issue":"1","key":"837_CR43","doi-asserted-by":"publisher","first-page":"73","DOI":"10.14778\/2735461.2735468","volume":"8","author":"F Li","year":"2014","unstructured":"Li, F., Jagadish, H.: Constructing an interactive natural language interface for relational databases. Proc. VLDB Endow. 8(1), 73\u201384 (2014)","journal-title":"Proc. VLDB Endow."},{"key":"837_CR44","doi-asserted-by":"crossref","unstructured":"Li, F., Jagadish, H.V.: Nalir: an interactive natural language interface for querying relational databases. In: Proceedings of the 2014 ACM SIGMOD International Conference on Management of Data, pp. 709\u2013712 (2014)","DOI":"10.1145\/2588555.2594519"},{"key":"837_CR45","doi-asserted-by":"crossref","unstructured":"Li, G., Muller, M., Thabet, A., Ghanem, B.: Deepgcns: Can gcns go as deep as cnns? In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9267\u20139276 (2019)","DOI":"10.1109\/ICCV.2019.00936"},{"key":"837_CR46","doi-asserted-by":"crossref","unstructured":"Li, G., Zhou, X., Cao, L.: Ai meets database: Ai4db and db4ai. In: Proceedings of the 2021 International Conference on Management of Data, pp. 2859\u20132866 (2021)","DOI":"10.1145\/3448016.3457542"},{"issue":"3","key":"837_CR47","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1109\/JSTSP.2020.2987417","volume":"14","author":"J Li","year":"2020","unstructured":"Li, J., Zhang, X., Jia, C., Xu, J., Zhang, L., Wang, Y., Ma, S., Gao, W.: Direct speech-to-image translation. IEEE J. Selected Top. Signal Process. 14(3), 517\u2013529 (2020)","journal-title":"IEEE J. Selected Top. Signal Process."},{"key":"837_CR48","doi-asserted-by":"crossref","unstructured":"Luong, T., Pham, H., Manning, C.D.: Effective approaches to attention-based neural machine translation. In: Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, pp. 1412\u20131421 (2015)","DOI":"10.18653\/v1\/D15-1166"},{"key":"837_CR49","doi-asserted-by":"crossref","unstructured":"Lyons, G., Tran, V., Binnig, C., Cetintemel, U., Kraska, T.: Making the case for query-by-voice with echoquery. In: Proceedings of the 2016 International Conference on Management of Data, pp. 2129\u20132132 (2016)","DOI":"10.1145\/2882903.2899394"},{"key":"837_CR50","unstructured":"Medsker, L.R., Jain, L.: Recurrent neural networks. Des. Appl. 5 (2001)"},{"key":"837_CR51","unstructured":"Nguyen, D.Q., et\u00a0al.: Investigating the impact of asr errors on spoken implicit discourse relation recognition. In: Proceedings of the First Workshop On Transcript Understanding, pp. 34\u201339 (2022)"},{"issue":"1","key":"837_CR52","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1109\/78.258122","volume":"42","author":"TQ Nguyen","year":"1994","unstructured":"Nguyen, T.Q.: Near-perfect-reconstruction pseudo-qmf banks. IEEE Trans. Signal Process. 42(1), 65\u201376 (1994)","journal-title":"IEEE Trans. Signal Process."},{"issue":"2","key":"837_CR53","first-page":"600","volume":"8","author":"N Nihalani","year":"2011","unstructured":"Nihalani, N., Silakari, S., Motwani, M.: Natural language interface for database: a brief review. Int. J. Comput. Sci. Issues (IJCSI) 8(2), 600 (2011)","journal-title":"Int. J. Comput. Sci. Issues (IJCSI)"},{"key":"837_CR54","doi-asserted-by":"crossref","unstructured":"Obaido, G., Ade-Ibijola, A., Vadapalli, H.: Talksql: A tool for the synthesis of sql queries from verbal specifications. In: 2020 2nd International Multidisciplinary Information Technology and Engineering Conference (IMITEC), pp. 1\u201310. IEEE (2020)","DOI":"10.1109\/IMITEC50163.2020.9334088"},{"key":"837_CR55","unstructured":"OpenAI: Chatgpt (2023). https:\/\/openai.com\/blog\/chatgpt"},{"key":"837_CR56","unstructured":"OpenAI: Gpt-4 technical report (2023)"},{"key":"837_CR57","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an asr corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"837_CR58","doi-asserted-by":"crossref","unstructured":"Peng, Z., Mo, K., Zhu, X., Chen, J., Chen, Z., Xu, Q., Ma, X.: Understanding user perceptions of robot\u2019s delay, voice quality-speed trade-off and gui during conversation. In: Extended Abstracts of the 2020 CHI Conference on Human Factors in Computing Systems, pp. 1\u20138 (2020)","DOI":"10.1145\/3334480.3382792"},{"key":"837_CR59","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: Global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"837_CR60","unstructured":"Povey, D., Ghoshal, A., Boulianne, G., Burget, L., Glembek, O., Goel, N., Hannemann, M., Motlicek, P., Qian, Y., Schwarz, P., et\u00a0al.: The kaldi speech recognition toolkit. In: IEEE 2011 Workshop on Automatic Speech Recognition and Understanding, CONF. IEEE Signal Processing Society (2011)"},{"key":"837_CR61","doi-asserted-by":"crossref","unstructured":"Rao, K., Sak, H., Prabhavalkar, R.: Exploring architectures, data and units for streaming end-to-end speech recognition with rnn-transducer. In: 2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 193\u2013199. IEEE (2017)","DOI":"10.1109\/ASRU.2017.8268935"},{"key":"837_CR62","unstructured":"Ren, Y., Hu, C., Tan, X., Qin, T., Zhao, S., Zhao, Z., Liu, T.Y.: Fastspeech 2: Fast and high-quality end-to-end text to speech. In: International Conference on Learning Representations (2020)"},{"key":"837_CR63","unstructured":"Rousseau, A., Del\u00e9glise, P., Esteve, Y.: Ted-lium: an automatic speech recognition dedicated corpus. In: LREC, pp. 125\u2013129 (2012)"},{"issue":"12","key":"837_CR64","doi-asserted-by":"publisher","first-page":"2747","DOI":"10.14778\/3407790.3407858","volume":"13","author":"J Sen","year":"2020","unstructured":"Sen, J., Lei, C., Quamar, A., \u00d6zcan, F., Efthymiou, V., Dalmia, A., Stager, G., Mittal, A., Saha, D., Sankaranarayanan, K.: Athena++ natural language querying for complex nested sql queries. Proc. VLDB Endow. 13(12), 2747\u20132759 (2020)","journal-title":"Proc. VLDB Endow."},{"key":"837_CR65","doi-asserted-by":"crossref","unstructured":"Shah, V., Li, S., Kumar, A., Saul, L.: Speakql: Towards speech-driven multimodal querying of structured data. In: Proceedings of the 2020 ACM SIGMOD International Conference on Management of Data, pp. 2363\u20132374 (2020)","DOI":"10.1145\/3318464.3389777"},{"key":"837_CR66","doi-asserted-by":"crossref","unstructured":"Shah, V., Li, S., Yang, K., Kumar, A., Saul, L.: Demonstration of speakql: speech-driven multimodal querying of structured data. In: Proceedings of the 2019 International Conference on Management of Data, pp. 2001\u20132004 (2019)","DOI":"10.1145\/3299869.3320224"},{"key":"837_CR67","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1016\/j.websem.2014.06.002","volume":"30","author":"S Shekarpour","year":"2015","unstructured":"Shekarpour, S., Marx, E., Ngomo, A.C.N., Auer, S.: Sina: Semantic interpretation of user queries for question answering on interlinked data. J. Web Semant. 30, 39\u201351 (2015)","journal-title":"J. Web Semant."},{"key":"837_CR68","doi-asserted-by":"crossref","unstructured":"Song, Y., Jiang, D., Huang, X., Li, Y., Xu, Q., Wong, R.C.W., Yang, Q.: Goldenretriever: A speech recognition system powered by modern information retrieval. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 4500\u20134502 (2020)","DOI":"10.1145\/3394171.3414392"},{"key":"837_CR69","doi-asserted-by":"crossref","unstructured":"Song, Y., Jiang, D., Zhao, X., Xu, Q., Wong, R.C.W., Fan, L., Yang, Q.: L2rs: A learning-to-rescore mechanism for hybrid speech recognition. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 1157\u20131166 (2021)","DOI":"10.1145\/3474085.3481542"},{"key":"837_CR70","doi-asserted-by":"crossref","unstructured":"Song, Y., Wong, R.C.W., Xuefang, Z., Jiang, D.: Voicequerysystem: a voice-driven database querying system using natural language questions. In: Proceedings of the 2022 ACM SIGMOD International Conference on Management of Data (2022)","DOI":"10.1145\/3514221.3520158"},{"key":"837_CR71","doi-asserted-by":"crossref","unstructured":"Stolcke, A.: Srilm-an extensible language modeling toolkit. In: Seventh International Conference on Spoken Language Processing (2002)","DOI":"10.21437\/ICSLP.2002-303"},{"key":"837_CR72","unstructured":"Sun, N., Yang, X., Liu, Y.: Tableqa: a large-scale chinese text-to-sql dataset for table-aware sql generation. arXiv pp. arXiv\u20132006 (2020)"},{"key":"837_CR73","doi-asserted-by":"crossref","unstructured":"Sun, Y., Tang, D., Duan, N., Ji, J., Cao, G., Feng, X., Qin, B., Liu, T., Zhou, M.: Semantic parsing with syntax-and table-aware sql generation. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 361\u2013372 (2018)","DOI":"10.18653\/v1\/P18-1034"},{"key":"837_CR74","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. Adv. Neural Inf. Process. Syst. 27 (2014)"},{"key":"837_CR75","doi-asserted-by":"publisher","first-page":"4395","DOI":"10.21437\/Interspeech.2019-2203","volume":"2019","author":"Z Tian","year":"2019","unstructured":"Tian, Z., Yi, J., Tao, J., Bai, Y., Wen, Z.: Self-attention transducers for end-to-end speech recognition. Proc. Interspeech 2019, 4395\u20134399 (2019)","journal-title":"Proc. Interspeech"},{"key":"837_CR76","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et\u00a0al.: Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"issue":"12","key":"837_CR77","doi-asserted-by":"publisher","first-page":"2869","DOI":"10.14778\/3415478.3415496","volume":"13","author":"I Trummer","year":"2020","unstructured":"Trummer, I.: Demonstrating the voice-based exploration of large data sets with cicerodb-zero. Proc. VLDB Endow. 13(12), 2869\u20132872 (2020)","journal-title":"Proc. VLDB Endow."},{"key":"837_CR78","unstructured":"Utama, P., Weir, N., Binnig, C., Cetintemel, U.: Voice-based data exploration: Chatting with your database. In: Proceedings of the Workshop on Search-Oriented Conversational AI (SCAI) (2017)"},{"key":"837_CR79","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp. 6000\u20136010 (2017)"},{"key":"837_CR80","first-page":"2692","volume":"28","author":"O Vinyals","year":"2015","unstructured":"Vinyals, O., Fortunato, M., Jaitly, N.: Pointer networks. Adv. Neural Inf. Process. Syst. 28, 2692\u20132700 (2015)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"837_CR81","volume-title":"Verbmobil: Foundations of Speech-to-Speech Translation","author":"W Wahlster","year":"2013","unstructured":"Wahlster, W.: Verbmobil: Foundations of Speech-to-Speech Translation. Springer Science & Business Media, Berlin (2013)"},{"issue":"3","key":"837_CR82","doi-asserted-by":"publisher","first-page":"328","DOI":"10.1109\/29.21701","volume":"37","author":"A Waibel","year":"1989","unstructured":"Waibel, A., Hanazawa, T., Hinton, G., Shikano, K., Lang, K.J.: Phoneme recognition using time-delay neural networks. IEEE Trans. Acoust. Speech Signal Process. 37(3), 328\u2013339 (1989)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"837_CR83","doi-asserted-by":"crossref","unstructured":"Wang, X., Qiao, T., Zhu, J., Hanjalic, A., Scharenborg, O.: S2igan: Speech-to-image generation via adversarial learning. In: INTERSPEECH 2020, pp. 2292\u20132296. ISCA (2020)","DOI":"10.21437\/Interspeech.2020-1759"},{"key":"837_CR84","doi-asserted-by":"publisher","first-page":"850","DOI":"10.1109\/TASLP.2021.3053391","volume":"29","author":"X Wang","year":"2021","unstructured":"Wang, X., Qiao, T., Zhu, J., Hanjalic, A., Scharenborg, O.: Generating images from spoken descriptions. IEEE\/ACM Trans. Audio Speech Language Process. 29, 850\u2013865 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Language Process."},{"issue":"8","key":"837_CR85","doi-asserted-by":"publisher","first-page":"1240","DOI":"10.1109\/JSTSP.2017.2763455","volume":"11","author":"S Watanabe","year":"2017","unstructured":"Watanabe, S., Hori, T., Kim, S., Hershey, J.R., Hayashi, T.: Hybrid ctc\/attention architecture for end-to-end speech recognition. IEEE J. Selected Top. Signal Process. 11(8), 1240\u20131253 (2017)","journal-title":"IEEE J. Selected Top. Signal Process."},{"key":"837_CR86","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q.V., Zhou, D., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"837_CR87","doi-asserted-by":"crossref","unstructured":"Weller, O., Sperber, M., Pires, T., Setiawan, H., Gollan, C., Telaar, D., Paulik, M.: End-to-end speech translation for code switched speech. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 1435\u20131448 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.113"},{"key":"837_CR88","doi-asserted-by":"crossref","unstructured":"Xu, J., Tan, X., Ren, Y., Qin, T., Li, J., Zhao, S., Liu, T.Y.: Lrspeech: Extremely low-resource speech synthesis and recognition. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 2802\u20132812 (2020)","DOI":"10.1145\/3394486.3403331"},{"key":"837_CR89","unstructured":"Xu, X., Liu, C., Song, D.: Sqlnet: Generating structured queries from natural language without reinforcement learning. arXiv preprint arXiv:1711.04436 (2017)"},{"key":"837_CR90","doi-asserted-by":"crossref","unstructured":"Yin, P., Neubig, G., Yih, W.t., Riedel, S.: Tabert: Pretraining for joint understanding of textual and tabular data. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 8413\u20138426 (2020)","DOI":"10.18653\/v1\/2020.acl-main.745"},{"key":"837_CR91","doi-asserted-by":"crossref","unstructured":"Yu, D., Deng, L.: AUTOMATIC SPEECH RECOGNITION. Springer (2016)","DOI":"10.1007\/978-1-4471-5779-3"},{"key":"837_CR92","doi-asserted-by":"crossref","unstructured":"Yu, T., Li, Z., Zhang, Z., Zhang, R., Radev, D.: Typesql: Knowledge-based type-aware neural text-to-sql generation. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers), pp. 588\u2013594 (2018)","DOI":"10.18653\/v1\/N18-2093"},{"key":"837_CR93","unstructured":"Yu, T., Wu, C.S., Lin, X.V., Tan, Y.C., Yang, X., Radev, D., Xiong, C., et\u00a0al.: Grappa: Grammar-augmented pre-training for table semantic parsing. In: International Conference on Learning Representations (2020)"},{"key":"837_CR94","doi-asserted-by":"crossref","unstructured":"Yu, T., Yasunaga, M., Yang, K., Zhang, R., Wang, D., Li, Z., Radev, D.: Syntaxsqlnet: Syntax tree networks for complex and cross-domain text-to-sql task. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 1653\u20131663 (2018)","DOI":"10.18653\/v1\/D18-1193"},{"key":"837_CR95","unstructured":"Yu, T., Zhang, R., Polozov, A., Meek, C., Awadallah, A.H.: Score: Pre-training for context representation in conversational semantic parsing. In: International Conference on Learning Representations (2021)"},{"key":"837_CR96","doi-asserted-by":"crossref","unstructured":"Yu, T., Zhang, R., Yang, K., Yasunaga, M., Wang, D., Li, Z., Ma, J., Li, I., Yao, Q., Roman, S., et\u00a0al.: Spider: A large-scale human-labeled dataset for complex and cross-domain semantic parsing and text-to-sql task. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 3911\u20133921 (2018)","DOI":"10.18653\/v1\/D18-1425"},{"key":"837_CR97","unstructured":"Zeng, A., Liu, X., Du, Z., Wang, Z., Lai, H., Ding, M., Yang, Z., Xu, Y., Zheng, W., Xia, X., et\u00a0al.: Glm-130b: An open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)"},{"issue":"3","key":"837_CR98","doi-asserted-by":"publisher","first-page":"166","DOI":"10.1016\/j.websem.2009.07.005","volume":"7","author":"G Zenz","year":"2009","unstructured":"Zenz, G., Zhou, X., Minack, E., Siberski, W., Nejdl, W.: From keywords to semantic queries-incremental query construction on the semantic web. J. Web Semant. 7(3), 166\u2013176 (2009)","journal-title":"J. Web Semant."},{"key":"837_CR99","doi-asserted-by":"crossref","unstructured":"Zeyer, A., Bahar, P., Irie, K., Schl\u00fcter, R., Ney, H.: A comparison of transformer and lstm encoder decoder models for asr. In: 2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 8\u201315. IEEE (2019)","DOI":"10.1109\/ASRU46091.2019.9004025"},{"key":"837_CR100","doi-asserted-by":"crossref","unstructured":"Zhang, R., Yu, T., Er, H., Shim, S., Xue, E., Lin, X.V., Shi, T., Xiong, C., Socher, R., Radev, D.: Editing-based sql query generation for cross-domain context-dependent questions. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 5338\u20135349 (2019)","DOI":"10.18653\/v1\/D19-1537"},{"key":"837_CR101","doi-asserted-by":"crossref","unstructured":"Zhao, X., Wang, L., He, R., Yang, T., Chang, J., Wang, R.: Multiple knowledge syncretic transformer for natural dialogue generation. In: Proceedings of The Web Conference 2020, pp. 752\u2013762 (2020)","DOI":"10.1145\/3366423.3380156"},{"key":"837_CR102","doi-asserted-by":"crossref","unstructured":"Zheng, W., Cheng, H., Zou, L., Yu, J.X., Zhao, K.: Natural language question\/answering: Let users talk with the knowledge graph. In: Proceedings of the 2017 ACM on Conference on Information and Knowledge Management, pp. 217\u2013226 (2017)","DOI":"10.1145\/3132847.3132977"},{"key":"837_CR103","unstructured":"Zhong, V., Xiong, C., Socher, R.: Seq2sql: Generating structured queries from natural language using reinforcement learning. arXiv preprint arXiv:1709.00103 (2017)"},{"key":"837_CR104","doi-asserted-by":"crossref","unstructured":"Zhou, S., Dong, L., Xu, S., Xu, B.: A comparison of modeling units in sequence-to-sequence speech recognition with the transformer on mandarin chinese. In: International Conference on Neural Information Processing, pp. 210\u2013220. Springer (2018)","DOI":"10.1007\/978-3-030-04221-9_19"},{"key":"837_CR105","unstructured":"Zhou, X., Chai, C., Li, G., Sun, J.: Database meets artificial intelligence: A survey. IEEE Trans. Knowl. Data Eng. (2020)"}],"container-title":["The VLDB Journal"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00778-024-00837-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00778-024-00837-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00778-024-00837-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,11]],"date-time":"2024-11-11T16:43:26Z","timestamp":1731343406000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00778-024-00837-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2,16]]},"references-count":105,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,7]]}},"alternative-id":["837"],"URL":"https:\/\/doi.org\/10.1007\/s00778-024-00837-0","relation":{},"ISSN":["1066-8888","0949-877X"],"issn-type":[{"value":"1066-8888","type":"print"},{"value":"0949-877X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,2,16]]},"assertion":[{"value":"28 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 December 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 December 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 February 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}