{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,13]],"date-time":"2025-11-13T07:21:31Z","timestamp":1763018491379,"version":"3.37.3"},"reference-count":21,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1836219"],"award-info":[{"award-number":["U1836219"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9687969","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"427-432","source":"Crossref","is-referenced-by-count":5,"title":["Topic Classification on Spoken Documents Using Deep Acoustic and Linguistic Features"],"prefix":"10.1109","author":[{"given":"Tan","family":"Liu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China,National Engineering Laboratory for Speech and Language Information Processing,Hefei,China"}]},{"given":"Wu","family":"Guo","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,National Engineering Laboratory for Speech and Language Information Processing,Hefei,China"}]}],"member":"263","reference":[{"key":"ref10","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461785"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461718"},{"key":"ref13","first-page":"1266","article-title":"Adaptive speaker normalization for etc-based speech recognition","author":"ding","year":"0","journal-title":"Proc Interspeech 2020"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2015.7404790"},{"key":"ref15","article-title":"Neural speech recognizer: Acoustic-to-word lstm model for large vocabulary speech recognition","author":"soltau","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461361"},{"key":"ref17","article-title":"Learning alignment for multi-modal emotion recognition from speech","author":"xu","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref18","article-title":"Layer normalization","author":"ba","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref19","first-page":"478","article-title":"Unsupervised deep embedding for clustering analysis","author":"xie","year":"2016","journal-title":"International Conference on Machine Learning"},{"key":"ref4","article-title":"Direct acoustics-to-word models for english conversational speech recognition","author":"audhkhasi","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-232"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref5","first-page":"939","article-title":"A comparison of sequence-to-sequence models for speech recognition","author":"rohit","year":"0","journal-title":"InterSpeech"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1181"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-1062"},{"key":"ref2","first-page":"1764","article-title":"Towards end-to-end speech recognition with recurrent neural networks","author":"graves","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2134090"},{"key":"ref9","article-title":"An introduction to convolutional neural networks","author":"o'shea","year":"2015","journal-title":"ArXiv Preprint"},{"key":"ref20","article-title":"Improving the accuracy of pre-trained word embeddings for sentiment analysis","author":"rezaeinia","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-32381-3_16"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2021,12,13]]},"location":"Cartagena, Colombia","end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09687969.pdf?arnumber=9687969","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:42:12Z","timestamp":1652733732000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9687969\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":21,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9687969","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}