{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T16:08:02Z","timestamp":1772726882828,"version":"3.50.1"},"reference-count":74,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2020]]},"DOI":"10.1109\/taslp.2019.2957889","type":"journal-article","created":{"date-parts":[[2019,12,24]],"date-time":"2019-12-24T04:32:03Z","timestamp":1577161923000},"page":"416-428","source":"Crossref","is-referenced-by-count":16,"title":["Weakly Supervised Representation Learning for Audio-Visual Scene Analysis"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2251-7436","authenticated-orcid":false,"given":"Sanjeel","family":"Parekh","sequence":"first","affiliation":[]},{"given":"Slim","family":"Essid","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4834-5166","authenticated-orcid":false,"given":"Alexey","family":"Ozerov","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7627-8194","authenticated-orcid":false,"given":"Ngoc Q. K.","family":"Duong","sequence":"additional","affiliation":[]},{"given":"Patrick","family":"Perez","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4960-0010","authenticated-orcid":false,"given":"Gael","family":"Richard","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref73","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","author":"lu","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.858005"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2428998"},{"key":"ref70","year":"0","journal-title":"NMF Mel Clustering Code"},{"key":"ref74","first-page":"1","article-title":"Audio-visual event localization in unconstrained videos","author":"tian","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0646-8"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2535231"},{"key":"ref33","first-page":"1","article-title":"Weakly supervised object detection with posterior regularization","author":"bilen","year":"0","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref32","first-page":"1417","article-title":"Multiple instance boosting for object detection","author":"zhang","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref31","first-page":"35","article-title":"Learning to separate object sounds by watching unlabeled video","author":"gao","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.311"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_22"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298668"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-08-051584-7.50010-3"},{"key":"ref62","article-title":"Youtube-8M: A large-scale video classification benchmark","author":"abu-el-haija","year":"2016","journal-title":"arXiv 1609 08675"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref63","article-title":"Multi-level attention model for weakly supervised audio classification","author":"yu","year":"2018","journal-title":"arXiv 1803 02353"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2895254"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1162\/neco.2008.04-08-771"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682847"},{"key":"ref65","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"arXiv 1412 6980"},{"key":"ref66","first-page":"268","article-title":"Identify, locate and separate: Audio-visual object extraction in large video collections using weak supervision","author":"parekh","year":"2019","journal-title":"Proc IEEE Workshop Appl Signal Process Audio Acoust"},{"key":"ref29","first-page":"1","article-title":"The sound of pixels","author":"zhao","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref67","article-title":"Ensemble of convolutional neural networks for weakly-supervised sound event detection using multiple scale input","author":"lee","year":"2017"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-73031-8_1"},{"key":"ref69","first-page":"36","article-title":"Source-filter based clustering for monaural blind source separation","author":"spiertz","year":"0","journal-title":"Proc Int Conf Digit Audio Effects"},{"key":"ref2","first-page":"5","article-title":"Short-term audiovisual atoms for generic video concept classification","author":"jiang","year":"0","journal-title":"Proc 17th ACM Int Conf Multimedia"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.17743\/jaes.2016.0007"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964310"},{"key":"ref22","article-title":"Surrey-CVSSP system for DCASE2017 challenge task4","author":"xu","year":"2017"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref24","article-title":"Knowledge transfer from weakly labeled audio using convolutional neural network for sound events and scenes","author":"kumar","year":"2017","journal-title":"arXiv 1711 01369"},{"key":"ref23","article-title":"DCASE 2017 submission: Multiple instance learning for sound event detection","author":"salamon","year":"2017"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682475"},{"key":"ref25","article-title":"Weakly labelled audioset classification with attention neural networks","author":"kong","year":"2019","journal-title":"arXiv 1903 00765"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.5244\/C.29.177"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2013.2251591"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.5244\/C.28.24"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.10"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1016\/S0004-3702(96)00034-3"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10578-9_48"},{"key":"ref10","first-page":"609","article-title":"Look, listen and learn","author":"arandjelovi?","year":"0","journal-title":"Proc IEEE Int Conf Comput Vis"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15561-1_33"},{"key":"ref12","first-page":"1247","article-title":"Deep canonical correlation analysis","author":"andrew","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref13","first-page":"85","article-title":"DCASE 2017 challenge setup: Tasks, datasets and baseline system","author":"mesaros","year":"0","journal-title":"Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)"},{"key":"ref14","article-title":"The kinetics human action video dataset","author":"kay","year":"2017","journal-title":"arXiv 1705 06950"},{"key":"ref15","author":"bregman","year":"1994","journal-title":"Auditory Scene Analysis The Perceptual Organization of Sound"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7177950"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2010.02.005"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952260"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7951792"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2670560"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/1290082.1290118"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2012.2228476"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s13735-012-0024-2"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.264"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.274"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0620-5"},{"key":"ref45","first-page":"391","article-title":"Edge boxes: Locating object proposals from edges","author":"zitnick","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2389824"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref42","first-page":"1637","article-title":"Weakly-supervised discovery of visual pattern configurations","author":"song","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref41","first-page":"1189","article-title":"Self-paced learning for latent variable models","author":"kumar","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.129"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_42"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/8938144\/08926380.pdf?arnumber=8926380","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T17:31:22Z","timestamp":1651080682000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8926380\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"references-count":74,"URL":"https:\/\/doi.org\/10.1109\/taslp.2019.2957889","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]}}}