{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,30]],"date-time":"2026-07-30T16:07:13Z","timestamp":1785427633816,"version":"3.56.0"},"reference-count":36,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Korean Government","award":["RS-2023-00212845"],"award-info":[{"award-number":["RS-2023-00212845"]}]},{"name":"Information Technology Research Center","award":["IITP-2024-RS2023-00259991"],"award-info":[{"award-number":["IITP-2024-RS2023-00259991"]}]},{"name":"Institute for Information and Communications Technology Planning and Evaluation"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Signal Process. Lett."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/lsp.2024.3483009","type":"journal-article","created":{"date-parts":[[2024,10,17]],"date-time":"2024-10-17T17:56:40Z","timestamp":1729187800000},"page":"2975-2979","source":"Crossref","is-referenced-by-count":42,"title":["Audio Mamba: Bidirectional State Space Model for Audio Representation Learning"],"prefix":"10.1109","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1909-4775","authenticated-orcid":false,"given":"Mehmet Hamza","family":"Erol","sequence":"first","affiliation":[{"name":"Computer Science Department, Stanford University, Stanford, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9141-3270","authenticated-orcid":false,"given":"Arda","family":"Senocak","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, KAIST, South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3626-6444","authenticated-orcid":false,"given":"Jiu","family":"Feng","sequence":"additional","affiliation":[{"name":"Department of Computer Science, UT Austin, Austin, TX, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7741-7275","authenticated-orcid":false,"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[{"name":"School of Electrical Engineering, KAIST, South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref4","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref5","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Touvron","year":"2021"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-698"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3224688"},{"key":"ref8","article-title":"Contrastive audio-visual masked autoencoder","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gong","year":"2022"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022954"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"ref12","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Nagrani","year":"2021"},{"key":"ref13","first-page":"4733","article-title":"Learning representations from audio-visual spatial alignment","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Morgado","year":"2020"},{"key":"ref14","first-page":"25","article-title":"Self-supervised multimodal versatile networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Alayrac","year":"2020"},{"key":"ref15","first-page":"24206","article-title":"VATT: Transformers for multimodal self-supervised learning from raw video, audio and text","volume-title":"Proc. 35th Int. Conf. Neural Inf. Process. Syst.","author":"Akbari","year":"2021"},{"key":"ref16","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Gu","year":"2023"},{"key":"ref17","article-title":"Efficiently modeling long sequences with structured state spaces","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gu","year":"2022"},{"key":"ref18","article-title":"Simplified state space layers for sequence modeling","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Smith","year":"2023"},{"key":"ref19","article-title":"Hungry hungry hippos: Towards language modeling with state space models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Fu","year":"2023"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-696"},{"key":"ref21","first-page":"7616","article-title":"It\u2019s raw! audio generation with state-space models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Goel","year":"2022"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097095"},{"key":"ref23","first-page":"62429","article-title":"Vision mamba: Efficient visual representation learning with bidirectional state space model","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhu","year":"2024"},{"key":"ref24","article-title":"Vivim: A video vision mamba for medical video object segmentation","author":"Yang","year":"2024"},{"key":"ref25","article-title":"VMamba: Visual state space model","author":"Liu","year":"2024"},{"key":"ref26","article-title":"U-Mamba: Enhancing long-range dependency for biomedical image segmentation","author":"Ma","year":"2024"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72111-3_54"},{"key":"ref28","first-page":"776","article-title":"AudioSet: An ontology and human-labeled dataset for audio events","volume-title":"Proc. IEEE Int. Conf. Acoust., Speech Signal Process.","author":"Gemmeke","year":"2017"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref31","article-title":"Speech commands: A dataset for limited-vocabulary speech recognition","author":"Warden","year":"2018"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096198"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref34","first-page":"28708","article-title":"Masked autoencoders that listen","volume-title":"Proc. 36th Int. Conf. Neural Inf. Process. Syst.","author":"Huang","year":"2022"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10961"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01531-2"}],"container-title":["IEEE Signal Processing Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/97\/10380231\/10720871.pdf?arnumber=10720871","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:59:14Z","timestamp":1732669154000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10720871\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/lsp.2024.3483009","relation":{},"ISSN":["1070-9908","1558-2361"],"issn-type":[{"value":"1070-9908","type":"print"},{"value":"1558-2361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}