{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T16:06:54Z","timestamp":1780589214789,"version":"3.54.1"},"reference-count":32,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,6,21]],"date-time":"2022-06-21T00:00:00Z","timestamp":1655769600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,6,21]],"date-time":"2022-06-21T00:00:00Z","timestamp":1655769600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004543","name":"China Scholarship Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004543","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000857","name":"Loughborough University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000857","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,6,21]]},"DOI":"10.1109\/ist55454.2022.9827729","type":"proceedings-article","created":{"date-parts":[[2022,7,20]],"date-time":"2022-07-20T19:37:45Z","timestamp":1658345865000},"page":"1-6","source":"Crossref","is-referenced-by-count":25,"title":["Spectrogram Transformers for Audio Classification"],"prefix":"10.1109","author":[{"given":"Yixiao","family":"Zhang","sequence":"first","affiliation":[{"name":"Loughborough University,Department of Computer Science,Loughborough,U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Baihua","family":"Li","sequence":"additional","affiliation":[{"name":"Loughborough University,Department of Computer Science,Loughborough,U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hui","family":"Fang","sequence":"additional","affiliation":[{"name":"Loughborough University,Department of Computer Science,Loughborough,U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qinggang","family":"Meng","sequence":"additional","affiliation":[{"name":"Loughborough University,Department of Computer Science,Loughborough,U.K."}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref32","article-title":"Soundnet: Learning sound representations from unlabeled video","volume":"29","author":"aytar","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref30","article-title":"Spec augment: A simple data augmentation method for automatic speech recognition","author":"park","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854950"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICDSP.2017.8096153"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCSNT47585.2019.8962462"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2016.7760424"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2020.107389"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2017.8081521"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9413035"},{"key":"ref17","article-title":"Rethinking cnn models for audio classification","author":"palanisamy","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref18","article-title":"Study of positional encoding approaches for audio spectrogram transformers","author":"pepino","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref19","article-title":"Effi-cient training of audio transformers with patchout","author":"koutini","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref28","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2017.07.032"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3019"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2016.03.020"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP.2016.7738875"},{"key":"ref8","author":"liang","year":"2019","journal-title":"Acoustic scene classification using attention-based convolutional neural network"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2018.07.033"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.3390\/s18061858"},{"key":"ref9","article-title":"Ast: Audio spectrogram trans-former","author":"gong","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/354384.354443"},{"key":"ref20","article-title":"An image is worth 16&#x00D7;16 words: Trans-formers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"International Conference on Learning Representations"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/j.sigpro.2018.07.012"},{"key":"ref21","first-page":"142","article-title":"Histogram of gradients of time-frequency representations for audio scene classification","volume":"23","author":"rakotomamonjy","year":"2014","journal-title":"IEEE\/ACM Transactions on Audio Speech and Language Processing"},{"key":"ref24","article-title":"Aclnet: efficient end-to-end audio classification cnn","author":"huang","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref23","article-title":"Comparison of time-frequency representations for en-vironmental sound classification using convolutional neural networks","author":"huzaifah","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref26","article-title":"Eranns: Efficient residual audio neural networks for audio pattern recognition","author":"verbitskiy","year":"2021","journal-title":"ar Xiv preprint"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053613"}],"event":{"name":"2022 IEEE International Conference on Imaging Systems and Techniques (IST)","location":"Kaohsiung, Taiwan","start":{"date-parts":[[2022,6,21]]},"end":{"date-parts":[[2022,6,23]]}},"container-title":["2022 IEEE International Conference on Imaging Systems and Techniques (IST)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9827659\/9827662\/09827729.pdf?arnumber=9827729","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,8]],"date-time":"2022-08-08T20:02:11Z","timestamp":1659988931000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9827729\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,21]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/ist55454.2022.9827729","relation":{},"subject":[],"published":{"date-parts":[[2022,6,21]]}}}