{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T19:52:58Z","timestamp":1770839578148,"version":"3.50.1"},"reference-count":63,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Powder, a Deep Tech Startup"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2022]]},"DOI":"10.1109\/access.2022.3164745","type":"journal-article","created":{"date-parts":[[2022,4,4]],"date-time":"2022-04-04T20:40:26Z","timestamp":1649104826000},"page":"41622-41638","source":"Crossref","is-referenced-by-count":11,"title":["Comparing Learning Methodologies for Self-Supervised Audio-Visual Representation Learning"],"prefix":"10.1109","volume":"10","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2217-4525","authenticated-orcid":false,"given":"Hacene","family":"Terbouche","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liam","family":"Schoneveld","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Oisin","family":"Benson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3442-0578","authenticated-orcid":false,"given":"Alice","family":"Othmani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-021-00444-8"},{"key":"ref2","article-title":"Self-supervised visual feature learning with deep neural networks: A survey","author":"Jing","year":"2019","journal-title":"arXiv:1902.06162"},{"key":"ref3","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020","journal-title":"arXiv:2002.05709"},{"key":"ref4","article-title":"Efficient estimation of word representations in vector space","author":"Mikolov","year":"2013","journal-title":"arXiv:1301.3781"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref6","article-title":"Enriching word vectors with subword information","author":"Bojanowski","year":"2016","journal-title":"arXiv:1607.04606"},{"key":"ref7","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"key":"ref8","article-title":"Language models are few-shot learners","author":"Brown","year":"2020","journal-title":"arXiv:2005.14165"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref12","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","author":"Caron","year":"2020","journal-title":"arXiv:2006.09882"},{"key":"ref13","article-title":"Barlow twins: Self-supervised learning via redundancy reduction","author":"Zbontar","year":"2021","journal-title":"arXiv:2103.03230"},{"key":"ref14","article-title":"Spatiotemporal contrastive video representation learning","author":"Qian","year":"2020","journal-title":"arXiv:2008.03800"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"ref16","article-title":"Self-supervised video representation learning with odd-one-out networks","author":"Fernando","year":"2016","journal-title":"arXiv:1611.06646"},{"key":"ref17","article-title":"Self-supervised learning by cross-modal audio-video clustering","author":"Alwassel","year":"2019","journal-title":"arXiv:1911.12667"},{"key":"ref18","article-title":"Self-supervised video representation learning with cross-stream prototypical contrasting","author":"Toering","year":"2021","journal-title":"arXiv:2106.10137"},{"key":"ref19","article-title":"Self-supervised co-training for video representation learning","author":"Han","year":"2020","journal-title":"arXiv:2010.09709"},{"key":"ref20","article-title":"Audio-visual instance discrimination with cross-modal agreement","author":"Morgado","year":"2020","journal-title":"arXiv:2004.12943"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1126\/science.1127647"},{"key":"ref22","article-title":"Building high-level features using large scale unsupervised learning","author":"Le","year":"2011","journal-title":"arXiv:1112.6209"},{"key":"ref23","first-page":"2672","article-title":"Generative adversarial nets","volume-title":"Proc. NIPS","author":"Goodfellow"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"ref27","article-title":"Unsupervised representation learning by predicting image rotations","author":"Gidaris","year":"2018","journal-title":"arXiv:1803.07728"},{"key":"ref28","article-title":"Unsupervised deep embedding for clustering analysis","author":"Xie","year":"2015","journal-title":"arXiv:1511.06335"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.556"},{"key":"ref30","article-title":"Discriminative unsupervised feature learning with exemplar convolutional neural networks","author":"Dosovitskiy","year":"2014","journal-title":"arXiv:1406.6909"},{"key":"ref31","article-title":"Unsupervised feature learning via non-parametric instance-level discrimination","author":"Wu","year":"2018","journal-title":"arXiv:1805.01978"},{"key":"ref32","first-page":"297","article-title":"Noise-contrastive estimation: A new estimation principle for unnormalized statistical models","volume-title":"Proc. 13th Int. Conf. Artif. Intell. Statist.","volume":"9","author":"Gutmann"},{"key":"ref33","article-title":"Exploring simple Siamese representation learning","author":"Chen","year":"2020","journal-title":"arXiv:2011.10566"},{"key":"ref34","article-title":"Momentum contrast for unsupervised visual representation learning","author":"He","year":"2019","journal-title":"arXiv:1911.05722"},{"key":"ref35","article-title":"Bootstrap your own latent: A new approach to self-supervised learning","author":"Grill","year":"2020","journal-title":"arXiv:2006.07733"},{"key":"ref36","article-title":"Generating videos with scene dynamics","author":"Vondrick","year":"2016","journal-title":"arXiv:1609.02612"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_24"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_32"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_26"},{"key":"ref41","article-title":"Unsupervised learning of video representations using LSTMs","author":"Srivastava","year":"2015","journal-title":"arXiv:1502.04681"},{"key":"ref42","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref43","article-title":"FlowNet: Learning optical flow with convolutional networks","author":"Fischer","year":"2015","journal-title":"arXiv:1504.06852"},{"key":"ref44","article-title":"Cross and learn: Cross-modal self-supervision","author":"Sayed","year":"2018","journal-title":"arXiv:1811.03879"},{"key":"ref45","article-title":"Cooperative learning of audio and video models from self-supervised synchronization","author":"Korbar","year":"2018","journal-title":"arXiv:1807.00230"},{"key":"ref46","article-title":"Perceiver: General perception with iterative attention","author":"Jaegle","year":"2021","journal-title":"arXiv:2103.03206"},{"key":"ref47","article-title":"Decomposing motion and content for natural video sequence prediction","author":"Villegas","year":"2017","journal-title":"arXiv:1706.08033"},{"key":"ref48","article-title":"Deep multi-scale video prediction beyond mean square error","author":"Mathieu","year":"2015","journal-title":"arXiv:1511.05440"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00021"},{"key":"ref50","article-title":"Improved deep metric learning with multi-class n-pair loss objective","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"29","author":"Sohn"},{"key":"ref51","article-title":"Self-labelling via simultaneous clustering and representation learning","author":"Markus Asano","year":"2019","journal-title":"arXiv:1911.05371"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1561\/2200000073"},{"key":"ref53","article-title":"Sinkhorn distances: Lightspeed computation of optimal transport","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"26","author":"Cuturi"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299154"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"ref56","article-title":"Deep residual learning for image recognition","author":"He","year":"2015","journal-title":"arXiv:1512.03385"},{"key":"ref57","article-title":"CNN architectures for large-scale audio classification","author":"Hershey","year":"2016","journal-title":"arXiv:1609.09430"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1238"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref61","article-title":"Large batch training of convolutional networks","author":"You","year":"2017","journal-title":"arXiv:1708.03888"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.23915\/distill.00007"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2020.102961"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/9668973\/09749114.pdf?arnumber=9749114","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T00:24:01Z","timestamp":1705537441000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9749114\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"references-count":63,"URL":"https:\/\/doi.org\/10.1109\/access.2022.3164745","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]}}}