{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,26]],"date-time":"2025-12-26T07:10:35Z","timestamp":1766733035928},"reference-count":25,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,4,14]]},"DOI":"10.1109\/icasspw62465.2024.10625827","type":"proceedings-article","created":{"date-parts":[[2024,8,15]],"date-time":"2024-08-15T17:19:18Z","timestamp":1723742358000},"page":"465-469","source":"Crossref","is-referenced-by-count":3,"title":["SpeechCLIP+: Self-Supervised Multi-Task Representation Learning for Speech Via Clip and Speech-Image Data"],"prefix":"10.1109","author":[{"given":"Hsuan-Fu","family":"Wang","sequence":"first","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi-Jen","family":"Shih","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Heng-Jui","family":"Chang","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Layne","family":"Berry","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Puyuan","family":"Peng","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hung-Yi","family":"Lee","sequence":"additional","affiliation":[{"name":"National Taiwan University,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hsin-Min","family":"Wang","sequence":"additional","affiliation":[{"name":"Academia Sinica,Institute of Information Science,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"David","family":"Harwath","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"ref2","article-title":"Neural discrete representation learning","author":"Den Oord","year":"2017","journal-title":"NIPS"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2019-1473"},{"key":"ref4","article-title":"vq-wav2vec: Self-supervised learning of discrete speech representations","author":"Baevski","year":"2020","journal-title":"ICLR"},{"key":"ref5","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747490"},{"key":"ref9","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","author":"Baevski","year":"2022","journal-title":"ICML"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846320"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10652"},{"article-title":"Self-supervised representation learning for speech using visual grounding and masked language modeling","year":"2022","author":"Peng","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022954"},{"key":"ref14","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1775"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096882"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054250"},{"article-title":"Collecting image annotations using Amazon\u2019s Mechanical Turk","volume-title":"NAACL HLT Workshop on Creating Speech and Language Data with Amazon\u2019s Mechanical Turk","author":"Rashtchian","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022991"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3185955"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-502"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023079"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747103"},{"article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International Conference on Machine Learning","author":"Li","key":"ref25"}],"event":{"name":"2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","start":{"date-parts":[[2024,4,14]]},"location":"Seoul, Korea, Republic of","end":{"date-parts":[[2024,4,19]]}},"container-title":["2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10625769\/10625780\/10625827.pdf?arnumber=10625827","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,16]],"date-time":"2024-08-16T05:46:37Z","timestamp":1723787197000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10625827\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,14]]},"references-count":25,"URL":"https:\/\/doi.org\/10.1109\/icasspw62465.2024.10625827","relation":{},"subject":[],"published":{"date-parts":[[2024,4,14]]}}}