{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T06:51:36Z","timestamp":1769151096245,"version":"3.49.0"},"reference-count":41,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100010663","name":"H2020 European Research Council","doi-asserted-by":"publisher","award":["637422"],"award-info":[{"award-number":["637422"]}],"id":[{"id":"10.13039\/100010663","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/taslp.2021.3065234","type":"journal-article","created":{"date-parts":[[2021,3,11]],"date-time":"2021-03-11T21:16:16Z","timestamp":1615497376000},"page":"1233-1242","source":"Crossref","is-referenced-by-count":38,"title":["Zero-Shot Audio Classification Via Semantic Embeddings"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7609-8049","authenticated-orcid":false,"given":"Huang","family":"Xie","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4604-9729","authenticated-orcid":false,"given":"Tuomas","family":"Virtanen","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/1008992.1009000"},{"key":"ref38","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"0","journal-title":"Proc 3 rd Int Conf Learn Representations"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref32","author":"mikolov","year":"2013"},{"key":"ref31","first-page":"3111","article-title":"Distributed representations of words and phrases and their compositionality","author":"mikolov","year":"0","journal-title":"Proc 26th Int Conf Neural Inf Process Syst"},{"key":"ref30","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"Proc 2019 Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref37","first-page":"2764","article-title":"WSABIE: Scaling up to large vocabulary image annotation","author":"weston","year":"0","journal-title":"Proc 22nd Int Joint Conf Artif Intell"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553509"},{"key":"ref35","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"0","journal-title":"Proc 3rd Int Conf Learn Representations"},{"key":"ref34","author":"devlin","year":"2019"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682558"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1080\/01431161.2013.774099"},{"key":"ref11","first-page":"248","article-title":"A software framework for musical data augmentation","author":"mcfee","year":"0","journal-title":"Proc 16th Int Soc Music Inf Retrieval Conf"},{"key":"ref12","first-page":"1126","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","volume":"70","author":"finn","year":"0","journal-title":"Proc 34th Int Conf Mach Learn"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3386252"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2763441"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.140"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-50077-5_2"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15549-9_55"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2487986"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.15"},{"key":"ref28","article-title":"Efficient estimation of word representations in vector space","author":"mikolov","year":"0","journal-title":"Proc 1st Int Conf Learn Representations"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref6","first-page":"486","article-title":"Freesound datasets: A platform for the creation of open audio datasets","author":"fonseca","year":"0","journal-title":"Proc 18th Int Soc Music Inf Retrieval Conf"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053251"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2017.2657381"},{"key":"ref2","author":"heittola","year":"2020"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053336"},{"key":"ref1","first-page":"1410","article-title":"Zero-shot learning with semantic output codes","author":"palatucci","year":"0","journal-title":"Proc 22nd Int Conf Neural Inf Process Syst"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2857768"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.575"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.474"},{"key":"ref24","first-page":"67","article-title":"Zero-shot learning for audio-based music classification and tagging","author":"choi","year":"0","journal-title":"Proc 20th Int Soc Music Inf Retrieval Conf"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3302506.3310402"},{"key":"ref26","article-title":"Siamese neural networks for one-shot image recognition","author":"koch","year":"0","journal-title":"Proc 32nd Annu Int Conf Mach Learn"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2019.8937283"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9289074\/09376628.pdf?arnumber=9376628","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T14:53:59Z","timestamp":1652194439000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9376628\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/taslp.2021.3065234","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}