{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,14]],"date-time":"2024-09-14T00:17:15Z","timestamp":1726273035028},"reference-count":24,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,10,22]],"date-time":"2023-10-22T00:00:00Z","timestamp":1697932800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,22]],"date-time":"2023-10-22T00:00:00Z","timestamp":1697932800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,10,22]]},"DOI":"10.1109\/waspaa58266.2023.10248180","type":"proceedings-article","created":{"date-parts":[[2023,10,18]],"date-time":"2023-10-18T17:42:17Z","timestamp":1697650937000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["Unsupervised Improvement of Audio-Text Cross-Modal Representations"],"prefix":"10.1109","author":[{"given":"Zhepei","family":"Wang","sequence":"first","affiliation":[{"name":"University of Illinois at Urbana-Champaign"}]},{"given":"Cem","family":"Subakan","sequence":"additional","affiliation":[{"name":"Universit&#x00E9; Laval"}]},{"given":"Krishna","family":"Subramani","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign"}]},{"given":"Junkai","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign"}]},{"given":"Tiago","family":"Tavares","sequence":"additional","affiliation":[{"name":"Insper"}]},{"given":"Fabio","family":"Ayres","sequence":"additional","affiliation":[{"name":"Insper"}]},{"given":"Paris","family":"Smaragdis","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign"}]}],"member":"263","reference":[{"article-title":"A survey of self-supervised learning from multiple perspectives: Algorithms, theory, applications and future trends","year":"2023","author":"Gui","key":"ref1"},{"article-title":"A simple framework for contrastive learning of visual representations","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Chen","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21315"},{"key":"ref5","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Radford"},{"article-title":"Florence: A new foundation model for computer vision","year":"2021","author":"Yuan","key":"ref6"},{"article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","year":"2021","author":"Jia","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9747631"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10095889"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.333"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01594"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27955"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref18","first-page":"119","article-title":"AudioCaps: Generating captions for audios in the wild","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Kim"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO54536.2021.9616087"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3229643"},{"key":"ref22","first-page":"1015","article-title":"ESC: Dataset for Environmental Sound Classification","volume-title":"Proceedings of the 23rd Annual ACM Conference on Multimedia","author":"Piczak"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"ref24","first-page":"85","article-title":"DCASE 2017 challenge setup: Tasks, datasets and baseline system","volume-title":"Proceedings of the Detection and Classification of Acoustic Scenes and Events 2017 Workshop (DCASE2017)","author":"Mesaros"}],"event":{"name":"2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","start":{"date-parts":[[2023,10,22]]},"location":"New Paltz, NY, USA","end":{"date-parts":[[2023,10,25]]}},"container-title":["2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10248019\/10248047\/10248180.pdf?arnumber=10248180","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,13]],"date-time":"2024-09-13T05:30:54Z","timestamp":1726205454000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10248180\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,22]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/waspaa58266.2023.10248180","relation":{},"subject":[],"published":{"date-parts":[[2023,10,22]]}}}