{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:27:45Z","timestamp":1759332465764,"version":"3.37.3"},"reference-count":82,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Toyota Motor Europe via the Research Project","award":["TRACE-Z\u00fcrich"],"award-info":[{"award-number":["TRACE-Z\u00fcrich"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,1,1]]},"DOI":"10.1109\/tpami.2022.3155643","type":"journal-article","created":{"date-parts":[[2022,3,3]],"date-time":"2022-03-03T20:28:07Z","timestamp":1646339287000},"page":"123-136","source":"Crossref","is-referenced-by-count":7,"title":["Binaural SoundNet: Predicting Semantics, Depth and Motion With Binaural Sounds"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5440-9678","authenticated-orcid":false,"given":"Dengxin","family":"Dai","sequence":"first","affiliation":[{"name":"Vision for Autonomous Systems Group, MPI for Informatics, Saarbr&#x00FC;cken, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Arun Balajee","family":"Vasudevan","sequence":"additional","affiliation":[{"name":"Computer Vision Lab, ETH Z&#x00FC;rich, Zrich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0863-4844","authenticated-orcid":false,"given":"Jiri","family":"Matas","sequence":"additional","affiliation":[{"name":"Center for Machine Perception, Czech Technical University, Prague, Czechia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3445-5711","authenticated-orcid":false,"given":"Luc","family":"Van Gool","sequence":"additional","affiliation":[{"name":"Computer Vision Lab, ETH Z&#x00FC;rich, Zrich, Switzerland"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref73","DOI":"10.1109\/ICCV.2019.00393"},{"doi-asserted-by":"publisher","key":"ref72","DOI":"10.1109\/CVPR.2011.5995508"},{"key":"ref71","first-page":"833","article-title":"Encoder-decoder with atrous separable convolution for semantic image segmentation","author":"chen","year":"2018","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref70","article-title":"Neural synthesis of binaural speech from mono audio","author":"richard","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"doi-asserted-by":"publisher","key":"ref76","DOI":"10.1037\/h0054629"},{"doi-asserted-by":"publisher","key":"ref77","DOI":"10.1109\/ICASSP.1983.1172092"},{"doi-asserted-by":"publisher","key":"ref74","DOI":"10.1177\/0278364913491297"},{"key":"ref39","article-title":"DeepWave: A recurrent neural-network for real-time acoustic imaging","author":"simeoni","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"doi-asserted-by":"publisher","key":"ref75","DOI":"10.1109\/CVPR.2017.179"},{"doi-asserted-by":"publisher","key":"ref38","DOI":"10.1155\/ASP\/2006\/12378"},{"doi-asserted-by":"publisher","key":"ref78","DOI":"10.1109\/TPAMI.2017.2699184"},{"doi-asserted-by":"publisher","key":"ref79","DOI":"10.1109\/ICASSP.2018.8461329"},{"year":"1997","author":"blauert","journal-title":"Spatial Hearing The Psychophysics of Human Sound Localization","key":"ref33"},{"key":"ref32","first-page":"9758","article-title":"Self-supervised learning by cross-modal audio-video clustering","author":"alwassel","year":"2020","journal-title":"Proc Int Conf Neural Inf Process"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1109\/CVPR46437.2021.00694"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.1109\/CVPR46437.2021.01274"},{"doi-asserted-by":"publisher","key":"ref37","DOI":"10.1109\/ROBOT.2009.5152861"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.3390\/s140201918"},{"doi-asserted-by":"publisher","key":"ref35","DOI":"10.1016\/j.robot.2017.07.011"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.1016\/j.csl.2015.03.003"},{"doi-asserted-by":"publisher","key":"ref60","DOI":"10.1109\/CVPR46437.2021.01526"},{"doi-asserted-by":"publisher","key":"ref62","DOI":"10.1371\/journal.pcbi.1006406"},{"key":"ref61","article-title":"Learning to set waypoints for audio-visual navigation","author":"chen","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"year":"2000","author":"begault","journal-title":"3-D Sound For Virtual Reality and Multimedia","key":"ref63"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1109\/CVPR42600.2020.00995"},{"doi-asserted-by":"publisher","key":"ref64","DOI":"10.3389\/fpsyg.2020.569056"},{"key":"ref27","first-page":"4733","article-title":"Learning representations from audio-visual spatial alignment","author":"morgado","year":"2020","journal-title":"Proc Int Conf Neural Inf Process"},{"doi-asserted-by":"publisher","key":"ref65","DOI":"10.1109\/CVPR.2016.264"},{"key":"ref66","article-title":"Self-supervised generation of spatial audio for 360 deg video","author":"pedro morgado","year":"2018","journal-title":"Proc 32nd Int Conf Neural Inf Process Syst"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1007\/978-3-030-58545-7_38"},{"doi-asserted-by":"publisher","key":"ref67","DOI":"10.1109\/CVPR.2019.00041"},{"doi-asserted-by":"publisher","key":"ref68","DOI":"10.1007\/978-3-030-58610-2_4"},{"doi-asserted-by":"publisher","key":"ref69","DOI":"10.1109\/CVPR46437.2021.01523"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1109\/CVPR.2016.350"},{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.1007\/s11263-009-0275-4"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1109\/3DV.2016.69"},{"key":"ref22","article-title":"DDSP: Differentiable digital signal processing","author":"engel","year":"2020","journal-title":"Proc Int Conf Learn Representations"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/TPAMI.2021.3054719"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.1145\/3240508.3240578"},{"key":"ref23","doi-asserted-by":"crossref","first-page":"373","DOI":"10.1162\/jocn.1993.5.3.373","article-title":"The merging of the senses","volume":"5","author":"fendrich","year":"1993","journal-title":"J Cogn Neurosci"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.1007\/978-3-030-01231-1_39"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.1007\/978-3-319-46448-0_48"},{"doi-asserted-by":"publisher","key":"ref50","DOI":"10.1109\/ICPR.2016.7900169"},{"doi-asserted-by":"publisher","key":"ref51","DOI":"10.1109\/CVPR46437.2021.01144"},{"doi-asserted-by":"publisher","key":"ref59","DOI":"10.1007\/978-3-030-58539-6_2"},{"doi-asserted-by":"publisher","key":"ref58","DOI":"10.1109\/ICRA40945.2020.9197008"},{"doi-asserted-by":"publisher","key":"ref57","DOI":"10.1109\/CVPR46437.2021.00817"},{"doi-asserted-by":"publisher","key":"ref56","DOI":"10.1109\/ICRA40945.2020.9196934"},{"doi-asserted-by":"publisher","key":"ref55","DOI":"10.1073\/pnas.1221464110"},{"doi-asserted-by":"publisher","key":"ref54","DOI":"10.1109\/CVPR.2015.7299122"},{"doi-asserted-by":"publisher","key":"ref53","DOI":"10.1109\/3DV.2017.00076"},{"doi-asserted-by":"publisher","key":"ref52","DOI":"10.1109\/ICASSP.2019.8683142"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.18653\/v1\/D19-1215"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1007\/978-3-030-01246-5_35"},{"doi-asserted-by":"publisher","key":"ref40","DOI":"10.1109\/CVPR.2018.00458"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1145\/3197517.3201391"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1109\/TASL.2012.2210877"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"ref15","first-page":"892","article-title":"SoundNet: Learning sound representations from unlabeled video","author":"aytar","year":"2016","journal-title":"Proc 30th Int Conf Neural Inf Process Syst"},{"key":"ref82","article-title":"Deep audio priors emerge from harmonic convolutional networks","author":"zhang","year":"2020","journal-title":"Proc Int Conf Learn Representations"},{"doi-asserted-by":"publisher","key":"ref16","DOI":"10.1109\/ICCV.2019.00182"},{"year":"2014","author":"kingma","article-title":"Adam: A method for stochastic optimization","key":"ref81"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.3389\/fnins.2014.00210"},{"key":"ref18","article-title":"Head movements during sound localizationtd","volume":"42","author":"thurlow","year":"1967","journal-title":"J Acoustical Soc America"},{"year":"2012","author":"huang","journal-title":"Human Factors in Augmented Reality Environments","key":"ref19"},{"doi-asserted-by":"publisher","key":"ref80","DOI":"10.1007\/s11263-007-0090-8"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/ICRA.2017.7989774"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/ICCV48922.2021.01059"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/ICCV.2019.00715"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1109\/LRA.2021.3062254"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.1207\/s15326969eco0501_1"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1207\/S15326969ECO1203_1"},{"doi-asserted-by":"publisher","key":"ref49","DOI":"10.1109\/TRO.2020.3031214"},{"key":"ref9","first-page":"1861","article-title":"Object referring in visual scene with spoken language","author":"balajee vasudevan","year":"2018","journal-title":"Proc IEEE Winter Conf Appl Comput Vis"},{"year":"2018","author":"marchegiani","article-title":"Listening for sirens: Locating and classifying acoustic alarms in city scenes","key":"ref46"},{"doi-asserted-by":"publisher","key":"ref45","DOI":"10.1007\/978-3-030-01216-8_16"},{"doi-asserted-by":"publisher","key":"ref48","DOI":"10.1145\/2647868.2655045"},{"key":"ref47","first-page":"4250","article-title":"Acoustic based safety emergency vehicle detection for intelligent transport systems","author":"fazenda","year":"2009","journal-title":"Proc Int Conf Control Autom Syst"},{"doi-asserted-by":"publisher","key":"ref42","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref41","first-page":"609","article-title":"Look, listen and learn","author":"arandjelovic","year":"2017","journal-title":"Proc IEEE Int Conf Comput Vis"},{"doi-asserted-by":"publisher","key":"ref44","DOI":"10.1109\/ICCV.2019.00398"},{"doi-asserted-by":"publisher","key":"ref43","DOI":"10.1109\/CVPR.2007.383344"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/9970415\/09726801.pdf?arnumber=9726801","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,26]],"date-time":"2022-12-26T19:15:37Z","timestamp":1672082137000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9726801\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,1]]},"references-count":82,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2022.3155643","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2023,1,1]]}}}