{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:02:51Z","timestamp":1765357371980,"version":"3.28.0"},"reference-count":28,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,4]]},"DOI":"10.1109\/icassp49357.2023.10096319","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T17:28:30Z","timestamp":1683307710000},"page":"1-5","source":"Crossref","is-referenced-by-count":8,"title":["SemanticAC: Semantics-Assisted Framework for Audio Classification"],"prefix":"10.1109","author":[{"given":"Yicheng","family":"Xiao","sequence":"first","affiliation":[{"name":"Tsinghua University,Tsinghua Shenzhen International Graduate School,China"}]},{"given":"Yue","family":"Ma","sequence":"additional","affiliation":[{"name":"Tsinghua University,Tsinghua Shenzhen International Graduate School,China"}]},{"given":"Shuyan","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University,Tsinghua Shenzhen International Graduate School,China"}]},{"given":"Hantao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Tsinghua University,Tsinghua Shenzhen International Graduate School,China"}]},{"given":"Ran","family":"Liao","sequence":"additional","affiliation":[{"name":"Tsinghua University,Tsinghua Shenzhen International Graduate School,China"}]},{"given":"Xiu","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University,Tsinghua Shenzhen International Graduate School,China"}]}],"member":"263","reference":[{"journal-title":"Septr Separable transformer for audio spectrogram processing","year":"2022","author":"ristea","key":"ref13"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"ref14","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume":"34","author":"nagrani","year":"2021","journal-title":"NeurIPS"},{"journal-title":"AST Audio Spectrogram Transformer","year":"2021","author":"gong","key":"ref11"},{"key":"ref10","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/1848\/1\/012046"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC45102.2020.9294415"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"journal-title":"Simvtp Simple video text pre-training with masked autoencoders","year":"2022","author":"ma","key":"ref16"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref18","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021","journal-title":"ICML"},{"key":"ref24","first-page":"9758","article-title":"Self-supervised learning by cross-modal audio-video clustering","volume":"33","author":"alwassel","year":"2020","journal-title":"NeurIPS"},{"journal-title":"End-to-end audio strikes back Boosting augmentations towards an efficient audio classification network","year":"2022","author":"gazneli","key":"ref23"},{"key":"ref26","first-page":"12475","article-title":"Audiovisual instance discrimination with cross-modal agreement","author":"morgado","year":"2021","journal-title":"CVPR"},{"journal-title":"Self-supervised audio-visual representation learning with relaxed cross-modal temporal synchronicity","year":"2021","author":"sarkar","key":"ref25"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"ref22","first-page":"3","article-title":"Cbam: Convolutional block attention module","author":"woo","year":"2018","journal-title":"ECCV"},{"journal-title":"An image is worth 16x16 words Transformers for image recognition at scale","year":"2020","author":"dosovitskiy","key":"ref21"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1137\/0330046"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9533654"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2019.06.040"},{"journal-title":"Rethinking cnn models for audio classification","year":"2020","author":"palanisamy","key":"ref9"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747584"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/IISA56318.2022.9904377"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2045"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854370"}],"event":{"name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2023,6,4]]},"location":"Rhodes Island, Greece","end":{"date-parts":[[2023,6,10]]}},"container-title":["ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10094559\/10094560\/10096319.pdf?arnumber=10096319","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T19:09:06Z","timestamp":1700507346000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10096319\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,4]]},"references-count":28,"URL":"https:\/\/doi.org\/10.1109\/icassp49357.2023.10096319","relation":{},"subject":[],"published":{"date-parts":[[2023,6,4]]}}}