{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T08:54:19Z","timestamp":1773737659244,"version":"3.50.1"},"reference-count":34,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,5,23]],"date-time":"2022-05-23T00:00:00Z","timestamp":1653264000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,5,23]]},"DOI":"10.1109\/icassp43922.2022.9747669","type":"proceedings-article","created":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T15:50:34Z","timestamp":1651074634000},"page":"4563-4567","source":"Crossref","is-referenced-by-count":150,"title":["Wav2CLIP: Learning Robust Audio Representations from Clip"],"prefix":"10.1109","author":[{"given":"Ho-Hsiang","family":"Wu","sequence":"first","affiliation":[{"name":"Music and Audio Research Laboratory, New York University,USA"}]},{"given":"Prem","family":"Seetharaman","sequence":"additional","affiliation":[{"name":"Descript, Inc."}]},{"given":"Kundan","family":"Kumar","sequence":"additional","affiliation":[{"name":"Descript, Inc."}]},{"given":"Juan Pablo","family":"Bello","sequence":"additional","affiliation":[{"name":"Music and Audio Research Laboratory, New York University,USA"}]}],"member":"263","reference":[{"key":"ref33","author":"rafii","year":"2019","journal-title":"MUSDB18-HQ - an uncompressed version of musdb18"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref31","article-title":"Investigating waveform and spectrogram feature fusion for audio classification","author":"fedorishin","year":"2021","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413376"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref10","first-page":"892","article-title":"Sound-net: Learning sound representations from unlabeled video","volume":"29","author":"aytar","year":"2016","journal-title":"NeurIPS"},{"key":"ref11","first-page":"609","article-title":"Look, listen and learn","author":"arandjelovic","year":"2017","journal-title":"ICCV"},{"key":"ref12","article-title":"Self-supervised learning by cross-modal audio-video clustering","author":"alwassel","year":"2020","journal-title":"NeurIPS"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682475"},{"key":"ref14","article-title":"Into the wild with audioscope: Unsupervised audio-visual separation of on-screen sounds","author":"tzinis","year":"2021","journal-title":"ICLRE"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref16","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021","journal-title":"ICML"},{"key":"ref17","article-title":"Audioclip: Extending clip to image, text and audio","author":"guzhov","year":"2021","journal-title":"arXiv preprint arXiv 2106 13112"},{"key":"ref18","first-page":"297","article-title":"Noise-contrastive estimation: A new estimation principle for unnormalized statistical models","author":"gutmann","year":"2010","journal-title":"AISTATS JMLR Workshop and Conference Proceedings"},{"key":"ref19","first-page":"8821","article-title":"Zero-shot text-to-image generation","volume":"139","author":"ramesh","year":"2021","journal-title":"PMLR"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.33682\/006b-jx26"},{"key":"ref4","article-title":"Contrastive learning of musical representations","author":"spijkervet","year":"2021","journal-title":"ISMIR"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053569"},{"key":"ref6","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"chen","year":"2020","journal-title":"ICML"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3120633"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414405"},{"key":"ref8","article-title":"Bert: Pretraining of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"NAACL"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_29"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref9","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"NeurIPS"},{"key":"ref1","article-title":"Representation learning with contrastive predictive coding","author":"den oord","year":"2018","journal-title":"arXiv preprint arXiv 1807 03748"},{"key":"ref20","article-title":"Bootstrap your own latent: A new approach to self-supervised learning","author":"grill","year":"2020","journal-title":"NeurIPS"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00129"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415085"},{"key":"ref25","article-title":"Fsd50k: an open dataset of human-labeled sound events","author":"fonseca","year":"2020","journal-title":"arXiv preprint arXiv 2010 00170"}],"event":{"name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Singapore, Singapore","start":{"date-parts":[[2022,5,23]]},"end":{"date-parts":[[2022,5,27]]}},"container-title":["ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9745891\/9746004\/09747669.pdf?arnumber=9747669","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,15]],"date-time":"2022-08-15T16:11:06Z","timestamp":1660579866000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9747669\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,23]]},"references-count":34,"URL":"https:\/\/doi.org\/10.1109\/icassp43922.2022.9747669","relation":{},"subject":[],"published":{"date-parts":[[2022,5,23]]}}}