{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T13:09:21Z","timestamp":1775912961593,"version":"3.50.1"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,5,23]]},"DOI":"10.1109\/icassp43922.2022.9746312","type":"proceedings-article","created":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T15:50:34Z","timestamp":1651074634000},"page":"646-650","source":"Crossref","is-referenced-by-count":229,"title":["HTS-AT: A Hierarchical Token-Semantic Audio Transformer for Sound Classification and Detection"],"prefix":"10.1109","author":[{"given":"Ke","family":"Chen","sequence":"first","affiliation":[{"name":"University of California San Diego"}]},{"given":"Xingjian","family":"Du","sequence":"additional","affiliation":[{"name":"Bytedance Inc.,AI Lab"}]},{"given":"Bilei","family":"Zhu","sequence":"additional","affiliation":[{"name":"Bytedance Inc.,AI Lab"}]},{"given":"Zejun","family":"Ma","sequence":"additional","affiliation":[{"name":"Bytedance Inc.,AI Lab"}]},{"given":"Taylor","family":"Berg-Kirkpatrick","sequence":"additional","affiliation":[{"name":"University of California San Diego"}]},{"given":"Shlomo","family":"Dubnov","sequence":"additional","affiliation":[{"name":"University of California San Diego"}]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682847"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3120633"},{"key":"ref13","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"NeurIPS 2017"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00288"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054478"},{"key":"ref19","article-title":"mixup: Beyond empirical risk minimization","author":"zhang","year":"0","journal-title":"2018 ICLR"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i4.20366"},{"key":"ref4","article-title":"Speech commands: A dataset for limited-vocabulary speech recognition","volume":"abs 1804 3209","author":"warden","year":"2018","journal-title":"CoRR"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414579"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414458"},{"key":"ref6","article-title":"Muspy: A toolkit for symbolic music generation","author":"dong","year":"0","journal-title":"ISMIR 2020"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICSC.2020.00025"},{"key":"ref5","article-title":"Music sketchnet: Controllable music generation via factorized representations of pitch and rhythm","author":"chen","year":"0","journal-title":"ISMIR 2020"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref2","article-title":"Automatic classification of musical instrument sounds","author":"herrera","year":"2010","journal-title":"Journal of New Music Research"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2731"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3090678"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref22","article-title":"Eranns: Efficient residual audio neural networks for audio pattern recognition","volume":"abs 2106 1621","author":"verbitskiy","year":"2021","journal-title":"CoRR"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87802-3_69"},{"key":"ref24","article-title":"Dcase 2021 challenge task 4: Sound event detection and separation in domestic environments","year":"0"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1286"},{"key":"ref26","article-title":"Training data-efficient image transformers & distillation through attention","author":"touvron","year":"0","journal-title":"ICML 2021"},{"key":"ref25","article-title":"Efficientnet: Rethinking model scaling for convolutional neural networks","author":"tan","year":"0","journal-title":"ICML 2019"}],"event":{"name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Singapore, Singapore","start":{"date-parts":[[2022,5,23]]},"end":{"date-parts":[[2022,5,27]]}},"container-title":["ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9745891\/9746004\/09746312.pdf?arnumber=9746312","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,22]],"date-time":"2022-08-22T16:12:36Z","timestamp":1661184756000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9746312\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,23]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/icassp43922.2022.9746312","relation":{},"subject":[],"published":{"date-parts":[[2022,5,23]]}}}