{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T13:16:02Z","timestamp":1775913362704,"version":"3.50.1"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2017,1,6]],"date-time":"2017-01-06T00:00:00Z","timestamp":1483660800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"National Natural Science Foundation of China (CN)","award":["61101160"],"award-info":[{"award-number":["61101160"]}]},{"name":"The Fundamental Research Funds for the Central Universities","award":["2015ZZ102"],"award-info":[{"award-number":["2015ZZ102"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61271314"],"award-info":[{"award-number":["61271314"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61571192"],"award-info":[{"award-number":["61571192"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Project of the Pearl River Young Talents of Science and Technology in Guangzhou, China","award":["2013J2200070"],"award-info":[{"award-number":["2013J2200070"]}]},{"DOI":"10.13039\/501100012245","name":"Science and Technology Planning Project of Guangdong Province","doi-asserted-by":"crossref","award":["2014A050503022"],"award-info":[{"award-number":["2014A050503022"]}],"id":[{"id":"10.13039\/501100012245","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100012245","name":"Science and Technology Planning Project of Guangdong Province","doi-asserted-by":"crossref","award":["2015A010103003"],"award-info":[{"award-number":["2015A010103003"]}],"id":[{"id":"10.13039\/501100012245","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Foundation of China Scholarship Council","award":["201208440078"],"award-info":[{"award-number":["201208440078"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2018,1]]},"DOI":"10.1007\/s11042-016-4332-z","type":"journal-article","created":{"date-parts":[[2017,1,6]],"date-time":"2017-01-06T07:55:11Z","timestamp":1483689311000},"page":"897-916","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["Using multi-stream hierarchical deep neural network to extract deep audio feature for acoustic event detection"],"prefix":"10.1007","volume":"77","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4362-1125","authenticated-orcid":false,"given":"Yanxiong","family":"Li","sequence":"first","affiliation":[]},{"given":"Xue","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Hai","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Xianku","family":"Li","sequence":"additional","affiliation":[]},{"given":"Qin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Qianhua","family":"He","sequence":"additional","affiliation":[]},{"given":"Qian","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,1,6]]},"reference":[{"key":"4332_CR1","doi-asserted-by":"crossref","unstructured":"Atrey PK, Maddage M, Kankanhalli MS (2006) Audio based event detection for multimedia surveillance. In: Proc. of IEEE ICASSP, pp 813\u2013816. IEEE","DOI":"10.1109\/ICASSP.2006.1661400"},{"key":"4332_CR2","unstructured":"British Broadcasting Corporation (BBC), \u201cBBC Sound Effects Library,\u201d http:\/\/www.sound-ideas.com\/bbc.html , Accessed May 2015"},{"key":"4332_CR3","doi-asserted-by":"crossref","unstructured":"Bugalho M, Portelo J, Trancoso I, Pellegrini T, Abad A (2009) Detecting audio events for semantic video search. In: Proc. of INTERSPEECH, pp 1151\u20131154. ISCA","DOI":"10.21437\/Interspeech.2009-335"},{"key":"4332_CR4","doi-asserted-by":"crossref","unstructured":"Cakir E, Heittola T, Huttunen H, Virtanen T (2015) Polyphonic sound event detection using multi label deep neural networks. In: Proc. of International Joint Conference on Neural Networks, pp 1\u20137. IEEE","DOI":"10.1109\/IJCNN.2015.7280624"},{"key":"4332_CR5","doi-asserted-by":"crossref","unstructured":"Chang CC, Lin CJ (2011) LIBSVM: a library for support vector machines. In: ACM Transactions on Intelligent Systems and Technology, 2:27:1\u201327:27. ACM","DOI":"10.1145\/1961189.1961199"},{"key":"4332_CR6","doi-asserted-by":"crossref","unstructured":"Childers DG, Skinner DP, Kemerait RC (1977) The cepstrum: a guide to processing. In: Proceeding of IEEE, 65(10):1428\u20131443. IEEE","DOI":"10.1109\/PROC.1977.10747"},{"key":"4332_CR7","unstructured":"Diment A, Heittola T, Virtanen T (2013) Sound event detection for office live and office synthetic AASP challenge. In: Proc. of IEEE AASP challenge on detection and classification of acoustic scenes and events. IEEE"},{"key":"4332_CR8","doi-asserted-by":"crossref","unstructured":"Fawcett T (2011) An introduction to ROC analysis. In: Pattern Recognition Letters, 27(8):861\u2013874. Elsevier","DOI":"10.1016\/j.patrec.2005.10.010"},{"key":"4332_CR9","unstructured":"Gabor D (1946) Theory of communication. In: Institute Electronica, no. 93, pp 429\u2013457"},{"key":"4332_CR10","unstructured":"Gencoglu O, Virtanen T, Huttunen H (2014) Recognition of acoustic events using deep neural networks. In: Proc. of the 22nd European Signal Processing Conference, pp 506\u2013510. ISCA"},{"key":"4332_CR11","doi-asserted-by":"crossref","unstructured":"Giannoulis D, Stowell D, Benetos E, Rossignol M, Lagrange M, Plumbley MD (2013) A database and challenge for acoustic scene classification and event detection. In: Proc. of EUSIPCO, pp 1\u20135. ISCA","DOI":"10.1109\/WASPAA.2013.6701819"},{"key":"4332_CR12","doi-asserted-by":"crossref","unstructured":"Grezl F, Karafiat M, Kontar S, Cernocky J (2007) Probabilistic and bottle-neck features for LVCSR of meetings. In: Proc. of IEEE ICASSP, pp 757\u2013760. IEEE","DOI":"10.1109\/ICASSP.2007.367023"},{"key":"4332_CR13","doi-asserted-by":"crossref","unstructured":"Heittola T, Klapuri A (2008) TUT acoustic event detection system 2007. In: multimodal technologies for perception of humans, vol. 4625 of the series Lecture Notes in Computer Science, pp 364\u2013370. Springer","DOI":"10.1007\/978-3-540-68585-2_35"},{"key":"4332_CR14","doi-asserted-by":"crossref","unstructured":"Heittola T, Mesaros A, Virtanen T, Gabbouj M (2013) Supervised model training for overlapping sound events based on unsupervised source separation. In: Proc. of IEEE ICASSP, Vancouver, Canada, pp 8677\u20138681. IEEE","DOI":"10.1109\/ICASSP.2013.6639360"},{"key":"4332_CR15","doi-asserted-by":"crossref","unstructured":"Hinton GE, Deng L, Yu D, Dahl GE, Mohamed AR, Jaitly N, Senior A, Vanhoucke V, Nguyen P, Sainath TN et al (2012) Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups. In: IEEE Signal Processing Magazine, 29(6):82\u201397. IEEE","DOI":"10.1109\/MSP.2012.2205597"},{"key":"4332_CR16","doi-asserted-by":"crossref","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"Hinton GE, Osindero S, Teh YW (2006) A fast learning algorithm for deep belief nets. Neural Comput 18:1527\u20131554, MIT Press","journal-title":"Neural Comput"},{"key":"4332_CR17","doi-asserted-by":"crossref","unstructured":"Jin F, Sattar F, Krishnan S (2012) Log-frequency spectrogram for respiratory sound monitoring. In: Proc. of IEEE ICASSP, pp 597\u2013600. IEEE","DOI":"10.1109\/ICASSP.2012.6287954"},{"key":"4332_CR18","doi-asserted-by":"crossref","unstructured":"Lin KZ, Pwint M (2010) Structuring sport video through audio event classification. In: PCM 2010, Part I, LNCS 6297, pp 481\u2013492. Springer","DOI":"10.1007\/978-3-642-15702-8_44"},{"key":"4332_CR19","unstructured":"Loren DE, Robert KO (1968) Programming and analysis for digital time series data, United Stated Department of Defense, first edition, Shock and Vibration Information Center"},{"key":"4332_CR20","doi-asserted-by":"crossref","unstructured":"Lu L, Hanjalic A (2009) audio keywords discovery for text-like audio content analysis and retrieval. In: IEEE Trans. on Multimedia 10(1):74\u201385. IEEE","DOI":"10.1109\/TMM.2007.911304"},{"key":"4332_CR21","doi-asserted-by":"crossref","unstructured":"Ma L, Milner B, Smith D (2006) Acoustic environment classification. In: ACM Trans. On Speech Language Processing, 3(2):1\u201322. ACM","DOI":"10.1145\/1149290.1149292"},{"key":"4332_CR22","doi-asserted-by":"crossref","unstructured":"McLoughlin I, Zhang HM, Xie ZP, Song Y, Xiao W (2015) Robust sound event classification using deep neural networks. In: IEEE Trans. on Audio, Speech, and Language Processing, 23(3):540\u2013552. IEEE","DOI":"10.1109\/TASLP.2015.2389618"},{"key":"4332_CR23","doi-asserted-by":"crossref","unstructured":"Moritz N, Anem\u00fcller J, Kollmeier B (2011) Amplitude modulation spectrogram based features for robust speech recognition in noisy and reverberant environments. In: Proc. of IEEE ICASSP, pp 5492\u20135495. IEEE","DOI":"10.1109\/ICASSP.2011.5947602"},{"key":"4332_CR24","doi-asserted-by":"crossref","unstructured":"Niessen ME, Van Kasteren TLM, Merentitis A (2013) Hierarchical modeling using automated sub-clustering for sound event recognition. In: Proc. of IEEE workshop on applications of signal processing to audio and acoustics, pp 1\u20134. IEEE","DOI":"10.1109\/WASPAA.2013.6701862"},{"key":"4332_CR25","unstructured":"Nogueira W, Roma G, Herrera P (2013) Automatic event classification using front end single channel noise reduction, MFCC features and a support vector machine classifier. In: IEEE AASP challenge: detection and classification of acoustic scenes and events. IEEE"},{"key":"4332_CR26","doi-asserted-by":"crossref","unstructured":"Okuyucu C, Sert M, Yazlcl A (2013) Audio feature and classifier analysis for efficient recognition of environmental sounds. In: Proc. of IEEE International Symposium on Multimedia, pp 125\u2013132. IEEE","DOI":"10.1109\/ISM.2013.29"},{"key":"4332_CR27","doi-asserted-by":"crossref","unstructured":"Phan H, Maa\u00df M, Mazur R, Mertins A (2015) Random regression forests for acoustic event detection and classification. In: IEEE Trans. on Audio Speech & Language Processing, 23(1):20\u201331. IEEE","DOI":"10.1109\/TASLP.2014.2367814"},{"issue":"1","key":"4332_CR28","doi-asserted-by":"crossref","first-page":"456","DOI":"10.1152\/jn.00851.2002","volume":"90","author":"A Qiu","year":"2003","unstructured":"Qiu A, Schreiner C, Escabi M (2003) Gabor analysis of auditory midbrain receptive fields: spectro-temporal and binaural composition. J Neurophysiol 90(1):456\u2013476, American Physiological Society","journal-title":"J Neurophysiol"},{"key":"4332_CR29","doi-asserted-by":"crossref","unstructured":"Schadler MR, Kollmeier B (2012) Normalization of spectro-temporal Gabor filter bank features for improved robust automatic speech recognition systems. In: Proc. of INTERSPEECH, pp 1\u20134. ISCA","DOI":"10.21437\/Interspeech.2012-493"},{"issue":"5","key":"4332_CR30","doi-asserted-by":"crossref","first-page":"4134","DOI":"10.1121\/1.3699200","volume":"131","author":"MR Sch\u00e4dler","year":"2012","unstructured":"Sch\u00e4dler MR, Meyer BT, Kollmeier B (2012) Spectro-temporal modulation subspace-spanning filter bank features for robust automatic speech recognition. J Acoust Soc Am 131(5):4134\u20134151, Acoustical Society of America","journal-title":"J Acoust Soc Am"},{"key":"4332_CR31","doi-asserted-by":"crossref","unstructured":"Schr\u00f6der J, Cauchi B, Sch\u00e4dler MR, Moritz N, Adiloglu K, Anem\u00fcller J, Doclo S, Kollmeier B, Goetze S (2013) Acoustic event detection using signal enhancement and spectro-temporal feature extraction. IEEE AASP challenge: detection and classification of acoustic scenes and events. IEEE","DOI":"10.1109\/WASPAA.2013.6701868"},{"key":"4332_CR32","doi-asserted-by":"crossref","unstructured":"Schr\u00f6der J, Goetze S, Anem\u00fcller J (2015) Spectro-temporal gabor filterbank features for acoustic event detection. In: IEEE\/ACM Trans. on Audio, Speech, and Language Processing, 23(12):2198\u20132208. IEEE\/ACM","DOI":"10.1109\/TASLP.2015.2467964"},{"key":"4332_CR33","doi-asserted-by":"crossref","unstructured":"Schr\u00f6der J, Moritz N, Sch\u00e4dler MR, Cauchi B, Adiloglu K, Anem\u00fcller J, Doclo S, Kollmeier B, Goetze S (2013) On the use of spectro-temporal features for the IEEE AASP challenge detection and classification of acoustic scenes and events. In: Proc. of IEEE Workshop on Applications of Signal Processing to Audio and Acoustics, pp 1\u20134. IEEE","DOI":"10.1109\/WASPAA.2013.6701868"},{"key":"4332_CR34","doi-asserted-by":"crossref","unstructured":"Temko A, Malkin R, Zieger C, Macho D, Nadeu C, Omologo M (2007) Clear evaluation of acoustic event detection and classification systems. Lecture notes in computing science, 4122:311\u2013322. Springer","DOI":"10.1007\/978-3-540-69568-4_29"},{"key":"4332_CR35","doi-asserted-by":"crossref","unstructured":"Temko A, Nadeu C (2009) Acoustic event detection in meeting-room environments. In: Pattern recognition letter, 30(14):1281\u20131288. Elsevier","DOI":"10.1016\/j.patrec.2009.06.009"},{"key":"4332_CR36","doi-asserted-by":"crossref","unstructured":"Temko A, Nadeu C, Macho D, Malkin R, Zieger C, Omologo M (2009) Acoustic event detection and classification. In: Computers in the human interaction loop, pp 61\u201373. Springer","DOI":"10.1007\/978-1-84882-054-8_7"},{"key":"4332_CR37","doi-asserted-by":"crossref","unstructured":"Varga A, Steeneken HJM (1993) Assessment for automatic speech recognition: II. NOISEX-92: A database and an experiment to study the effect of additive noise on speech recognition systems. In: Speech Communication, 12(3):247\u2013251. ISCA","DOI":"10.1016\/0167-6393(93)90095-3"},{"key":"4332_CR38","doi-asserted-by":"crossref","unstructured":"Vesel\u00fd K, Luk\u00e1\u0161 B, Franti\u0161ek (2010) Parallel training of neural networks for speech recognition. In: Proc. of INTERSPEECH, pp 439\u2013446. ISCA","DOI":"10.1007\/978-3-642-15760-8_56"},{"key":"4332_CR39","doi-asserted-by":"crossref","unstructured":"Wang S, Yang X, Zhang Y, Phillips P, Yang J, Yuan T (2015) Identification of green, Oolong and black teas in China via wavelet packet entropy and fuzzy support vector machine. In: Entropy, 17(10):6663\u20136682. MDPI","DOI":"10.3390\/e17106663"},{"key":"4332_CR40","volume-title":"The HTK Book, version 3.4","author":"SJ Young","year":"2006","unstructured":"Young SJ, Evermann G, Gales MJF, Hain T, Kershaw D, Moore G, Odell J, Ollason D, Povey D, Valtchev V, Woodland PC (2006) The HTK Book, version 3.4. Cambridge University Engineering Department, Cambridge"},{"key":"4332_CR41","doi-asserted-by":"crossref","unstructured":"Yu D, Seltzer ML (2011) Improved bottleneck features using pretrained deep neural networks. In: Proc. of INTERSPEECH, pp 237\u2013240. ISCA","DOI":"10.21437\/Interspeech.2011-91"},{"issue":"4","key":"4332_CR42","doi-asserted-by":"crossref","first-page":"317","DOI":"10.1002\/ima.22144","volume":"25","author":"Y Zhang","year":"2015","unstructured":"Zhang Y, Chen S, Wang S, Yang J, Phillips P (2015) Magnetic resonance brain image classification based on weighted-type fractional Fourier transform and nonparallel support vector machine. Int J Imaging Syst Technol 25(4):317\u2013327, Wiley","journal-title":"Int J Imaging Syst Technol"},{"key":"4332_CR43","doi-asserted-by":"crossref","unstructured":"Zhang X, He Q, Feng X (2015) Acoustic feature extraction by tensor-based sparse representation for sound effects classification. In: Proc. of IEEE ICASSP, pp 166\u2013170. IEEE","DOI":"10.1109\/ICASSP.2015.7177953"},{"key":"4332_CR44","doi-asserted-by":"crossref","unstructured":"Zhang Y, Wu L (2012) Classification of fruits using computer vision and a multiclass support vector machine. In: Sensors, 12(9):12489\u201312505. MDPI","DOI":"10.3390\/s120912489"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-016-4332-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-016-4332-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-016-4332-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,14]],"date-time":"2025-06-14T02:05:05Z","timestamp":1749866705000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-016-4332-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,1,6]]},"references-count":44,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2018,1]]}},"alternative-id":["4332"],"URL":"https:\/\/doi.org\/10.1007\/s11042-016-4332-z","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,1,6]]}}}