{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,27]],"date-time":"2025-07-27T07:53:29Z","timestamp":1753602809249,"version":"3.37.3"},"reference-count":35,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100006190","name":"Research and Development","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006190","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014219","name":"National Science Fund for Distinguished Young Scholars","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100014219","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,4]]},"DOI":"10.1109\/icassp49357.2023.10095330","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T17:28:30Z","timestamp":1683307710000},"page":"1-5","source":"Crossref","is-referenced-by-count":2,"title":["Self-Convolution for Automatic Speech Recognition"],"prefix":"10.1109","author":[{"given":"Tian-Hao","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Science and Technology Beijing,Department of Computer Science and Technology,Beijing,China,100083"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qi","family":"Liu","sequence":"additional","affiliation":[{"name":"University of Science and Technology Beijing,Department of Computer Science and Technology,Beijing,China,100083"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinyuan","family":"Qian","sequence":"additional","affiliation":[{"name":"University of Science and Technology Beijing,Department of Computer Science and Technology,Beijing,China,100083"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Song-Lu","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology Beijing,Department of Computer Science and Technology,Beijing,China,100083"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feng","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology Beijing,USTB-EEasyTech Joint Lab of Artificial Intelligence,Beijing,China,100083"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xu-Cheng","family":"Yin","sequence":"additional","affiliation":[{"name":"University of Science and Technology Beijing,Department of Computer Science and Technology,Beijing,China,100083"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"article-title":"Linformer: Selfattention with linear complexity","year":"2020","author":"wang","key":"ref13"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref12","article-title":"cosformer: Rethinking softmax in attention","author":"qin","year":"2022","journal-title":"ICLR International Conference on Learning Representations"},{"article-title":"Adam: A method for stochastic optimization","year":"2014","author":"kingma","key":"ref34"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00518"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-711"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9687874"},{"key":"ref33","first-page":"6224","article-title":"Intermediate loss regularization for ctc-based speech recognition","author":"lee","year":"2021","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing ICASSP"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-427"},{"key":"ref32","first-page":"2207","article-title":"Espnet: End-toend speech processing toolkit","author":"watanabe","year":"2018","journal-title":"10th Annual Conference of the International Speech Communication Association InterSpeech"},{"key":"ref2","first-page":"5884","article-title":"Speech-transformer: A no-recurrence sequence-to-sequence model for speech recognition","author":"dong","year":"2018","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing ICASSP"},{"key":"ref1","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NIPS-Annual Conference on Neural Information Processing Systems"},{"article-title":"MLP-ASR: sequence-length agnostic all-MLP architectures for speech recognition","year":"2022","author":"sakuma","key":"ref17"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2471"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2059"},{"key":"ref18","first-page":"5899","article-title":"Transformer-based end-to-end speech recognition with local dense synthesizer attention","author":"xu","year":"2021","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing ICASSP"},{"key":"ref24","first-page":"933","article-title":"Language modeling with gated convolutional networks","author":"dauphin","year":"2017","journal-title":"Proceedings of the ICML International Conference on Machine Learning"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01214"},{"key":"ref26","first-page":"5206","article-title":"Librispeech: an asr corpus based on public domain audio books","author":"panayotov","year":"2015","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing ICASSP"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICSDA.2017.8384449"},{"article-title":"Citrinet: Closing the gap between non-autoregressive and autoregressive end-to-end models for automatic speech recognition","year":"2021","author":"majumdar","key":"ref20"},{"article-title":"Mobilenets: Efficient convolutional neural networks for mobile vision applications","year":"2017","author":"howard","key":"ref22"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref28","article-title":"The kaldi speech recognition toolkit","author":"povey","year":"2011","journal-title":"IEEE Workshop Automatic Speech Recognition Understanding"},{"key":"ref27","first-page":"3935","article-title":"Enhancing` the TED-LIUM corpus with selected data for language modeling and more TED talks","author":"rousseau","year":"2014","journal-title":"Proceedings of the Ninth International Conference on Language Resources and Evaluation LREC"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(98)00033-8"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref7","first-page":"9204","article-title":"Pay attention to mlps","author":"liu","year":"2021","journal-title":"NIPS-Annual Conference on Neural Information Processing Systems"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"article-title":"Non-autoregressive transformer with unified bidirectional decoder for automatic speech recognition","year":"2021","author":"zhang","key":"ref4"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1107"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.319"},{"key":"ref5","first-page":"24261","article-title":"Mlpmixer: An all-mlp architecture for vision","author":"tolstikhin","year":"2021","journal-title":"NIPS-Annual Conference on Neural Information Processing Systems"}],"event":{"name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2023,6,4]]},"location":"Rhodes Island, Greece","end":{"date-parts":[[2023,6,10]]}},"container-title":["ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10094559\/10094560\/10095330.pdf?arnumber=10095330","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,13]],"date-time":"2023-11-13T18:59:08Z","timestamp":1699901948000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10095330\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,4]]},"references-count":35,"URL":"https:\/\/doi.org\/10.1109\/icassp49357.2023.10095330","relation":{},"subject":[],"published":{"date-parts":[[2023,6,4]]}}}