{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T16:06:05Z","timestamp":1780589165648,"version":"3.54.1"},"reference-count":43,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2022,10,1]],"date-time":"2022-10-01T00:00:00Z","timestamp":1664582400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100002341","name":"Academy of Finland","doi-asserted-by":"publisher","award":["332063"],"award-info":[{"award-number":["332063"]}],"id":[{"id":"10.13039\/501100002341","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Teaching machines to listen"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Signal Process."],"published-print":{"date-parts":[[2022,10]]},"DOI":"10.1109\/jstsp.2022.3180592","type":"journal-article","created":{"date-parts":[[2022,6,8]],"date-time":"2022-06-08T19:39:59Z","timestamp":1654717199000},"page":"1467-1479","source":"Crossref","is-referenced-by-count":12,"title":["Self-Supervised Learning of Audio Representations From Audio-Visual Data Using Spatial Alignment"],"prefix":"10.1109","volume":"16","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9530-6958","authenticated-orcid":false,"given":"Shanshan","family":"Wang","sequence":"first","affiliation":[{"name":"Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Archontis","family":"Politis","sequence":"additional","affiliation":[{"name":"Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6640-9752","authenticated-orcid":false,"given":"Annamaria","family":"Mesaros","sequence":"additional","affiliation":[{"name":"Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4604-9729","authenticated-orcid":false,"given":"Tuomas","family":"Virtanen","sequence":"additional","affiliation":[{"name":"Faculty of Information Technology and Communication Sciences, Tampere University, Tampere, Finland"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Language agnostic speech embeddings for emotion classification","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Nandan","year":"2020"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415009"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683158"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA52581.2021.9632739"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_40"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682475"},{"key":"ref9","first-page":"45","article-title":"Audio-visual scene classification: Analysis of DCASE 2021 challenge submissions","volume-title":"Proc. Detection Classification Acoustic Scenes Events Workshop","author":"Wang","year":"2021"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00715"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01144"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_37"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683142"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093307"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00995"},{"key":"ref17","first-page":"4733","article-title":"Learning Representations From Audio-Visual Spatial Alignment","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Morgado","year":"2020"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.3390\/app7111204"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415085"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2428998"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref23","first-page":"18661","article-title":"Supervised Contrastive Learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Khosla","year":"2020"},{"key":"ref24","article-title":"Representation learning with contrastive predictive coding","author":"Oord","year":"2018"},{"key":"ref25","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen","year":"2020"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref29","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Vaswani","year":"2017"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-45664-4_8"},{"key":"ref31","article-title":"Jsambisonics: A web audio library for interactive spatial sound processing on the web","volume-title":"Proc. Interactive Audio Syst. Symp.","author":"Politis","year":"2016"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2015.7362645"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1002\/9781119252634.ch5"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2019.2900164"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3047233"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.33682\/4jhy-bj81"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-17207-7"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2881912"},{"key":"ref39","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2019.8937282"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8546070"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2337846"}],"container-title":["IEEE Journal of Selected Topics in Signal Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/4200690\/9923627\/09790080.pdf?arnumber=9790080","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,1]],"date-time":"2024-02-01T03:00:01Z","timestamp":1706756401000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9790080\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10]]},"references-count":43,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/jstsp.2022.3180592","relation":{},"ISSN":["1932-4553","1941-0484"],"issn-type":[{"value":"1932-4553","type":"print"},{"value":"1941-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,10]]}}}