{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T19:45:30Z","timestamp":1730231130592,"version":"3.28.0"},"reference-count":32,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,4]]},"DOI":"10.1109\/icassp49357.2023.10095534","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T17:28:30Z","timestamp":1683307710000},"page":"1-5","source":"Crossref","is-referenced-by-count":3,"title":["Unified Keyword Spotting and Audio Tagging on Mobile Devices with Transformers"],"prefix":"10.1109","author":[{"given":"Heinrich","family":"Dinkel","sequence":"first","affiliation":[{"name":"Xiaomi Corperation,Beijing,China"}]},{"given":"Yongqing","family":"Wang","sequence":"additional","affiliation":[{"name":"Xiaomi Corperation,Beijing,China"}]},{"given":"Zhiyong","family":"Yan","sequence":"additional","affiliation":[{"name":"Xiaomi Corperation,Beijing,China"}]},{"given":"Junbo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xiaomi Corperation,Beijing,China"}]},{"given":"Yujun","family":"Wang","sequence":"additional","affiliation":[{"name":"Xiaomi Corperation,Beijing,China"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1428"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414777"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747025"},{"article-title":"Attention-free keyword spotting","year":"2021","author":"morshed","key":"ref14"},{"article-title":"Psla: Improving audio event classification with pretraining, sampling, labeling, and aggregation","year":"2021","author":"gong","key":"ref31"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3078715"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747295"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1286"},{"article-title":"Gaussian error linear units (gelus)","year":"2016","author":"hendrycks","key":"ref32"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3073596"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-607"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","year":"2021","author":"kolesnikov","key":"ref19"},{"key":"ref18","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"article-title":"Speech commands: A dataset for limited-vocabulary speech recognition","year":"2018","author":"warden","key":"ref23"},{"key":"ref26","article-title":"8-bit optimizers via block-wise quantization","author":"dettmers","year":"2021","journal-title":"International Conference on Learning Representations"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"article-title":"Masked autoencoders that listen","year":"2022","author":"xu","key":"ref20"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746431"},{"article-title":"Mega: Moving average equipped gated attention","year":"2022","author":"ma","key":"ref28"},{"key":"ref27","first-page":"8026","article-title":"PyTorch: An Imperative Style, High-Performance Deep Learning Library","author":"paszke","year":"2019","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1058"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-352"},{"article-title":"Data augmentation for robust keyword spotting under playback interference","year":"2018","author":"raju","key":"ref7"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-383"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-260"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1977"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3132"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1363"}],"event":{"name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2023,6,4]]},"location":"Rhodes Island, Greece","end":{"date-parts":[[2023,6,10]]}},"container-title":["ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10094559\/10094560\/10095534.pdf?arnumber=10095534","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,13]],"date-time":"2023-11-13T19:03:25Z","timestamp":1699902205000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10095534\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,4]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/icassp49357.2023.10095534","relation":{},"subject":[],"published":{"date-parts":[[2023,6,4]]}}}