{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T15:23:22Z","timestamp":1761578602221},"reference-count":57,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2016,6,16]],"date-time":"2016-06-16T00:00:00Z","timestamp":1466035200000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vietnam J Comput Sci"],"published-print":{"date-parts":[[2016,11]]},"DOI":"10.1007\/s40595-016-0071-3","type":"journal-article","created":{"date-parts":[[2016,6,16]],"date-time":"2016-06-16T08:44:29Z","timestamp":1466066669000},"page":"247-257","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":16,"title":["Speech classification using SIFT features on spectrogram images"],"prefix":"10.1007","volume":"3","author":[{"given":"Quang Trung","family":"Nguyen","sequence":"first","affiliation":[]},{"given":"The Duy","family":"Bui","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,6,16]]},"reference":[{"key":"71_CR1","doi-asserted-by":"crossref","first-page":"431","DOI":"10.1037\/h0020279","volume":"74","author":"AM Liberman","year":"1967","unstructured":"Liberman, A.M., Cooper, F.S., Shankweiler, D.P., Studdert-Kennedy, M.: Perception of speech code. Psychol. Rev. 74, 431\u2013461 (1967)","journal-title":"Psychol. Rev."},{"key":"71_CR2","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/0010-0277(85)90021-6","volume":"21","author":"AM Liberman","year":"1985","unstructured":"Liberman, A.M., Mattingly, I.G.: The motor theory of speech perception revised. Cognition 21, 1\u201336 (1985)","journal-title":"Cognition"},{"key":"71_CR3","unstructured":"Cole, R., Fanty, M.: ISOLET (Isolated Letter Speech Recognition), Department of Computer Science and Engineering, September 12 (1994)"},{"key":"71_CR4","doi-asserted-by":"crossref","unstructured":"Massaro, D.W.: Testing between the TRACE Model and the Fuzzy Logical Model of Speech perception. Cognitive Psychology, pp. 398\u2013421 (1989)","DOI":"10.1016\/0010-0285(89)90014-5"},{"key":"71_CR5","doi-asserted-by":"crossref","unstructured":"McClelland, J.L., Elman, J.L.:The TRACE model of speech perception. Cognitive Psychology (1986)","DOI":"10.1016\/0010-0285(86)90015-0"},{"key":"71_CR6","doi-asserted-by":"crossref","first-page":"71","DOI":"10.1016\/0010-0277(87)90005-9","volume":"25","author":"W Wilson","year":"1984","unstructured":"Wilson, W., Marslen, M.: Functional parallelism in spoken word-recognition. Cognition 25, 71\u2013102 (1984)","journal-title":"Cognition"},{"key":"71_CR7","doi-asserted-by":"crossref","unstructured":"Patel, I.: Speech recognition using HMM with MFCC\u2014an analysis using frequency spectral decomposition technique. Signal & Image Proc Int J (SIPIJ). 1(2) (2010)","DOI":"10.5121\/sipij.2010.1209"},{"key":"71_CR8","unstructured":"Paul, D.B.: Speech Recognition Using Hidden Markov Models. Lincoln Lab. J. 3(1) (1990)"},{"issue":"12","key":"71_CR9","first-page":"0975","volume":"42","author":"TB Adam","year":"2012","unstructured":"Adam, T.B.: Spoken english alphabet recognition with mel frequency cepstral coefficients and back propagation neural networks. Int J Comput Appl. 42(12), 0975\u20138887 (2012)","journal-title":"Int J Comput Appl."},{"key":"71_CR10","first-page":"364","volume":"8","author":"MSH Salam","year":"2011","unstructured":"Salam, M.S.H., Mohamad, D., Salleh, S.: Malay isolated speech recognition using neural network: a work in finding number of hidden nodes and learning parameters. Int Arab J Info Technol 8, 364\u2013371 (2011)","journal-title":"Int Arab J Info Technol"},{"key":"71_CR11","doi-asserted-by":"crossref","unstructured":"Sakoe, H., Chiba, S.: Dynamic programming algorithm optimization for spoken word recognition. In: IEEE Transactions on Acoustics, Speech and Signal Processing, pp. 43\u201349 (1978)","DOI":"10.1109\/TASSP.1978.1163055"},{"key":"71_CR12","doi-asserted-by":"crossref","unstructured":"Hinton, G., et\u00a0al.: Deep neural networks for acoustic modeling in speech recognition: the shared views of four research groups. In: IEEE Signal Process, pp. 82\u201397 (2012)","DOI":"10.1109\/MSP.2012.2205597"},{"key":"71_CR13","volume-title":"Convolutional neural networks for speech recognition in IEEE\/ACM transactions on audio","author":"O Abdel-Hamid","year":"2014","unstructured":"Abdel-Hamid, O., et al.: Convolutional neural networks for speech recognition in IEEE\/ACM transactions on audio. Speech and language processing, October, USA (2014)"},{"key":"71_CR14","doi-asserted-by":"crossref","unstructured":"Hermansky: Perceptual linear predictive (PLP) analysis of speech. J. Acoust. Soc. Am. 87(4), 1738\u201352 (1990)","DOI":"10.1121\/1.399423"},{"key":"71_CR15","doi-asserted-by":"crossref","unstructured":"Favero R.F.: Compound wavelets: wavelets for speech recognition. In: International symposium on time-frequency and time-scale analysis, pp. 600\u2013603, (1994)","DOI":"10.1109\/TFSA.1994.467280"},{"key":"71_CR16","doi-asserted-by":"crossref","unstructured":"Jaitly, N., Hinton, G.: Learning a better representation of speech soundwaves using restricted boltzmann machines. In: Proc. of ICASSP, pp. 5884\u20135887 (2011)","DOI":"10.1109\/ICASSP.2011.5947700"},{"key":"71_CR17","doi-asserted-by":"crossref","unstructured":"Sainath T., Weiss, R., Senior, A., Wilson, W., Vinyals O.: Learning the Speech Front-end with Raw Waveform CLDNNs. In: Interspeech (2015)","DOI":"10.21437\/Interspeech.2015-1"},{"key":"71_CR18","unstructured":"Dimitri, P., Mathew, M.D., Ronan, C.: Analysis of CNN-based speech recognition system using raw speech as input. In: Interspeech (2015)"},{"key":"71_CR19","doi-asserted-by":"crossref","unstructured":"Boiman, O., Shechtman, E., Iran, M.: In defense of nearest-neighbor based image classification. In: CVPR (2008)","DOI":"10.1109\/CVPR.2008.4587598"},{"key":"71_CR20","doi-asserted-by":"crossref","unstructured":"McCann, S., Lowe, D.G.: Local Naive Bayes nearest neighbor for image classification. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248111"},{"key":"71_CR21","doi-asserted-by":"crossref","unstructured":"Lowe, D.G.: Distinctive image features from scale-invariant keypoints. In: IJCV (2004)","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"71_CR22","first-page":"1150","volume":"2","author":"DG Lowe","year":"1999","unstructured":"Lowe, D.G.: Object recognition from local scale-invariant features. Proceedings of the international conference on computer vision 2, 1150\u20131157 (1999)","journal-title":"Proceedings of the international conference on computer vision"},{"key":"71_CR23","unstructured":"Sakriani, S., Konstantin, M., Satoshi, N., Wolfgang, M.: Incorporating knowledge sources into statistical speech recognition.: Springer Science & Business Media (2009)"},{"key":"71_CR24","unstructured":"Sadaoki, F.: 50 years of Progress in speech and Speaker Recognition Research. vol. 1, no. 2, November (2005)"},{"key":"71_CR25","doi-asserted-by":"crossref","unstructured":"Davis K.H., Biddulph R., Balashek, S.: Automatic recognition of spoken digits. J. Acoust. Soc. Am, pp. 637\u2013642 (1952)","DOI":"10.1121\/1.1906946"},{"issue":"6","key":"71_CR26","doi-asserted-by":"crossref","first-page":"1072","DOI":"10.1121\/1.1908561","volume":"28","author":"HF Olson","year":"1996","unstructured":"Olson, H.F., Belar, H.: Phonetic typewriter. J. Acoust. Soc. Am. 28(6), 1072\u20131081 (1996)","journal-title":"J. Acoust. Soc. Am."},{"key":"71_CR27","doi-asserted-by":"crossref","unstructured":"Fry D.B.: Theoretical aspects of mechanical speech recognition. J. Br. Inst. Radio Eng., pp. 211\u2013299 (1959)","DOI":"10.1049\/jbire.1959.0026"},{"key":"71_CR28","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP.1979.1170821","volume-title":"Speaker independent recognition of isolated words using clustering techniques","author":"LR Rabiner","year":"1979","unstructured":"Rabiner, L.R., Levinson, S.E., Rosenberg, A.E., Wilpon, J.G.: Speaker independent recognition of isolated words using clustering techniques. IEEE Trans. Acoustics, Speech, Signal Proc (1979)"},{"key":"71_CR29","doi-asserted-by":"crossref","unstructured":"Sakoe, H.,: Two level DP matching\u2014a dynamic programming based pattern matching algorithm for connected word recognition. IEEE Trans. Acoustics, Speech, Signal Proc., pp. 588\u2013595 (1979)","DOI":"10.1109\/TASSP.1979.1163310"},{"key":"71_CR30","doi-asserted-by":"crossref","first-page":"430","DOI":"10.1109\/89.544528","volume":"4","author":"PC Loizou","year":"1996","unstructured":"Loizou, P.C., Spanias, A.S.: High-performance alphabet recognition. IEEE Trans. Speech Audio Proc. 4, 430\u2013445 (1996)","journal-title":"IEEE Trans. Speech Audio Proc."},{"key":"71_CR31","doi-asserted-by":"crossref","unstructured":"Cole, R., Fanty, M., Muthusamy, Y., Gopalakrishnan M.: Speaker-independent recognition of spoken english letters. In: International Joint Conference on Neural Networks (IJCNN), pp. 45\u201351 (1990)","DOI":"10.1109\/IJCNN.1990.137693"},{"key":"71_CR32","doi-asserted-by":"crossref","unstructured":"Cole, R., Fanty, M.,: Spoken letter recognition. In: Presented at the Proceedings of the conference on advances in neural information processing systems Denver, Colorado, United States (1990)","DOI":"10.3115\/116580.116725"},{"key":"71_CR33","unstructured":"Fanty, M., Cole, R.: Spoken Letter Recognition. In: Presented at the Proceedings of the conference on advances in neural information processing systems Denver, Colorado, United States (1990)"},{"key":"71_CR34","doi-asserted-by":"crossref","first-page":"647","DOI":"10.1109\/89.943342","volume":"9","author":"M Karnjanadecha","year":"2001","unstructured":"Karnjanadecha, M., Zahorian, S.A.: Signal modeling for high-performance robust isolated word recognition. IEEE Trans. Speech Audio Proc. 9, 647\u2013654 (2001)","journal-title":"IEEE Trans. Speech Audio Proc."},{"key":"71_CR35","unstructured":"Ibrahim, M.D., Ahmad, A.M., Smaon, D.F., Salam M.S.H.: Improved E-set recognition performance using time-expanded features. In: Presented at the second national conference on computer graphics and multimedia (CoGRAMM), Selangor, Malaysia (2004)"},{"key":"71_CR36","doi-asserted-by":"crossref","unstructured":"Jonathan, D., Da, T.H., Haizhou, L.: Spectrogram Image feature for sound event classification in mismatched conditions. In: IEEE Signal Processing letters, pp. 130\u2013133 (2011)","DOI":"10.1109\/LSP.2010.2100380"},{"key":"71_CR37","unstructured":"Mohamed, A.R., Dahl, G.E., Hinton, G.E.: Deep belief networks for phone recognition. In: NIPS workshop on deep learning for speech recognition and related applications (2009)"},{"key":"71_CR38","doi-asserted-by":"crossref","unstructured":"Mohamed, A., Dahl, G., Hinton, G.: \u201cAcoustic modeling using deep belief networks. In: IEEE Trans. Speech, & Language Proc, Audio (2012)","DOI":"10.1109\/TASL.2011.2109382"},{"key":"71_CR39","doi-asserted-by":"crossref","unstructured":"Mohamed, A., Hinton, G., Penn, G.: Understanding how deep belief networks perform acoustic modelling. In: Proc. ICASSP (2012)","DOI":"10.1109\/ICASSP.2012.6288863"},{"key":"71_CR40","doi-asserted-by":"crossref","unstructured":"Bocchieri, E., Dimitriadis, D.: Investigating deep neural network k based transforms of robust audio features for lvcsr. In: ICASSP (2013)","DOI":"10.1109\/ICASSP.2013.6638960"},{"key":"71_CR41","doi-asserted-by":"crossref","unstructured":"Tuske, Z., Golik, P., Schluter, R., Ney, H.: Acoustic modeling with deep neural networks using raw time signal for lvcsr. In: Interspeech (2014)","DOI":"10.21437\/Interspeech.2014-223"},{"key":"71_CR42","doi-asserted-by":"crossref","unstructured":"Palaz, D., Magimai, M., Collobert, R.: Convolutional neural networks-based continuous speech recognition using raw speech signal. In: ICASSP (2015)","DOI":"10.1109\/ICASSP.2015.7178781"},{"key":"71_CR43","doi-asserted-by":"crossref","unstructured":"Domingos, P., Pazzani, M.: On the optimality of the simple Bayesian classifier under zero-one loss. J. Mach., pp. 103\u2013130 (1997)","DOI":"10.1023\/A:1007413511361"},{"key":"71_CR44","doi-asserted-by":"crossref","unstructured":"Behmo, R., Marcombes, P., Dalalyan, A., Prinet V.: Towards optimal naive bayes nearest neighbor. In: ECCV (2010)","DOI":"10.1007\/978-3-642-15561-1_13"},{"key":"71_CR45","doi-asserted-by":"crossref","unstructured":"Tuytelaars, T., Fritz, M., Saenko, K., Darrell, T.: The NBNN kernel. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126449"},{"key":"71_CR46","doi-asserted-by":"crossref","unstructured":"Wang, J., Yang, J., Yu, K., Huang, T., Gong, Y.: Locality-constrained linear coding for image classification. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540018"},{"key":"71_CR47","unstructured":"Liu, L., Wang, L., Liu, X.: In defense of soft-assignment coding. In: ICCV (2011)"},{"issue":"4","key":"71_CR48","doi-asserted-by":"crossref","first-page":"2231","DOI":"10.1121\/1.408683","volume":"95","author":"C Ma","year":"1994","unstructured":"Ma, C., O\u2019Shaughnessy, D.: A perceptual study of source coding of Fourier phase and amplitude of the linear predictive coding residual of vowel sound. J. Acoust. Soc. Am. 95(4), 2231\u20132239 (1994)","journal-title":"J. Acoust. Soc. Am."},{"key":"71_CR49","doi-asserted-by":"crossref","unstructured":"Cano, P., Batlle, E., Kalker, T., Haitsma, J.: A review of audio fingerprinting. J. VLSI Signal Proc. Syst. Signal Image Video Technol. 41, 271\u2013284 (2005)","DOI":"10.1007\/s11265-005-4151-3"},{"key":"71_CR50","unstructured":"Wang, A.L.C.: https:\/\/www.ee.columbia.edu\/dpwe\/papers\/ . Accessed 15 Nov 2015"},{"key":"71_CR51","unstructured":"https:\/\/catalog.ldc.upenn.edu\/LDC2008S07 . Accessed 15 Nov 2015"},{"key":"71_CR52","unstructured":"http:\/\/www.alovoice.vn\/ai\/du-lieu-tieng-noi-tieng-viet\/ . Acces-sed 15 Nov 2015"},{"key":"71_CR53","unstructured":"http:\/\/research.nii.ac.jp\/src\/en\/TMW.html . Accessed 15 Nov 2015"},{"key":"71_CR54","unstructured":"http:\/\/research.nii.ac.jp\/src\/en\/JVPD.html . Accessed 15 Nov 2015"},{"key":"71_CR55","doi-asserted-by":"crossref","unstructured":"Diehl, R.L., Lotto, A.J., Holt, L.L.: Speech perception. Annu. Rev. Psychol, pp. 149\u2013179 (2004)","DOI":"10.1146\/annurev.psych.55.090902.142028"},{"key":"71_CR56","volume-title":"Perception of the speech code","author":"AM Liberman","year":"1967","unstructured":"Liberman, A.M., Cooper, F.S., Shankweiler, D.P., Studdert-Kennedy, M.: Perception of the speech code. Psychol, Rev (1967)"},{"key":"71_CR57","doi-asserted-by":"crossref","unstructured":"Dah, G., Yu, D., Deng, L., Acero, A.: Context-dependent pre-trained deep neural networks for large vocabulary speech recognition. In: IEEE Trans Speech, Lang Proc. Audio, USA (2012)","DOI":"10.1109\/TASL.2011.2134090"}],"container-title":["Vietnam Journal of Computer Science"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s40595-016-0071-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s40595-016-0071-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s40595-016-0071-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,1]],"date-time":"2022-07-01T20:21:16Z","timestamp":1656706876000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s40595-016-0071-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,6,16]]},"references-count":57,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2016,11]]}},"alternative-id":["71"],"URL":"https:\/\/doi.org\/10.1007\/s40595-016-0071-3","relation":{},"ISSN":["2196-8888","2196-8896"],"issn-type":[{"value":"2196-8888","type":"print"},{"value":"2196-8896","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016,6,16]]}}}