{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T06:18:05Z","timestamp":1777184285065,"version":"3.51.4"},"reference-count":25,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,4]]},"DOI":"10.1109\/icassp.2018.8462015","type":"proceedings-article","created":{"date-parts":[[2018,9,21]],"date-time":"2018-09-21T22:24:48Z","timestamp":1537568688000},"page":"5509-5513","source":"Crossref","is-referenced-by-count":57,"title":["Learning Filterbanks from Raw Speech for Phone Recognition"],"prefix":"10.1109","author":[{"given":"Neil","family":"Zeghidour","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nicolas","family":"Usunier","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Iasonas","family":"Kokkinos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thomas","family":"Schatz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gabriel","family":"Synnaeve","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Emmanuel","family":"Dupoux","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1186\/s13636-015-0068-3","article-title":"Phone recognition with hierarchical convolutional deep maxout networks","volume":"2015","author":"laszlo","year":"2015","journal-title":"EURASIP Journal on Audio Speech and Music Processing"},{"key":"ref11","author":"wayne","year":"2016","journal-title":"Achieving human parity in conversational speech 
recognition"},{"key":"ref12","doi-asserted-by":"crossref","first-page":"4114","DOI":"10.1109\/TSP.2014.2326991","article-title":"Deep scattering spectrum","volume":"62","author":"joakim","year":"2014","journal-title":"IEEE Transactions on Signal Processing"},{"key":"ref13","article-title":"Deep scattering spectrum with deep neural networks","author":"vijayaditya","year":"2014","journal-title":"ICASSP IEEE"},{"key":"ref14","article-title":"A deep scattering spectrum - deep siamese network pipeline for unsupervised acoustic modeling","author":"neil","year":"2016","journal-title":"ICASSP IEEE"},{"key":"ref15","author":"dimitri","year":"2013","journal-title":"End-to-end phoneme sequence recognition using convolutional neural networks"},{"key":"ref16","author":"aaron van","year":"2016","journal-title":"WaveNet A Generative Model for Raw Audio"},{"key":"ref17","article-title":"Towards end-to-end speech recognition with deep convolutional neural networks","author":"ying","year":"2017"},{"key":"ref18","article-title":"Attention-based models for speech recognition","author":"jan k","year":"0","journal-title":"NIPS"},{"key":"ref19","author":"liang","year":"2016","journal-title":"Segmental recurrent neural networks for end-to-end speech recognition"},{"key":"ref4","article-title":"Imagenet classification with deep convolutional neural networks","author":"alex","year":"2012","journal-title":"NIPS"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"2278","DOI":"10.1109\/5.726791","article-title":"Gradient-based learning applied to document recognition","volume":"86","author":"yann","year":"1998","journal-title":"Proceedings of the IEEE"},{"key":"ref6","article-title":"Speech acoustic modeling from raw multichannel waveforms","author":"yedid","year":"2015","journal-title":"Proceedings of ICASSP"},{"key":"ref5","article-title":"Estimating phoneme class conditional probabilities from raw speech signal using convolutional neural 
networks","author":"dimitri","year":"2013"},{"key":"ref8","author":"andros","year":"2017","journal-title":"Attention-based wav2text with feature transfer learning"},{"key":"ref7","article-title":"Learning the speech front-end with raw waveform cldnns","author":"tara n","year":"2015","journal-title":"InterSpeech"},{"key":"ref2","article-title":"An efficient auditory filterbank based on the gammatone function","volume":"2","author":"rd","year":"1987","journal-title":"Meeting IOC Speech Group on Auditory Modeling at RSRE"},{"key":"ref9","author":"min","year":"2013","journal-title":"Network in Network"},{"key":"ref1","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1109\/TASSP.1980.1163420","article-title":"Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences","volume":"28","author":"steven","year":"1980","journal-title":"IEEE Transactions on Acoustics Speech and Signal Processing"},{"key":"ref20","article-title":"Timit acoustic-phonetic continuous speech corpus","volume":"10","author":"john","year":"1993","journal-title":"Linguistic Data Consortium"},{"key":"ref22","article-title":"Delving deep into rectifiers: Surpassing human-level performance on imagenet classification","author":"kaiming","year":"2015","journal-title":"CVPR"},{"key":"ref21","first-page":"1929","article-title":"Dropout: a simple way to prevent neural networks from overfitting","volume":"15","author":"nitish","year":"2014","journal-title":"Journal of Machine Learning Research"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"978","DOI":"10.1038\/nature04485","article-title":"Efficient auditory coding","volume":"439","author":"evan c","year":"2006","journal-title":"Nature"},{"key":"ref23","author":"ronan","year":"2016","journal-title":"Wav2Letter an end-to-end convnet-based speech recognition system"},{"key":"ref25","doi-asserted-by":"crossref","first-page":"412","DOI":"10.1121\/1.384752","article-title":"Parametric coding of 
speech spectra","volume":"68","author":"jl","year":"1980","journal-title":"The Journal of the Acoustical Society of America"}],"event":{"name":"ICASSP 2018 - 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Calgary, AB","start":{"date-parts":[[2018,4,15]]},"end":{"date-parts":[[2018,4,20]]}},"container-title":["2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8450881\/8461260\/08462015.pdf?arnumber=8462015","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,8,24]],"date-time":"2020-08-24T04:27:31Z","timestamp":1598243251000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8462015\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,4]]},"references-count":25,"URL":"https:\/\/doi.org\/10.1109\/icassp.2018.8462015","relation":{},"subject":[],"published":{"date-parts":[[2018,4]]}}}