{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T16:26:23Z","timestamp":1774023983784,"version":"3.50.1"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2023,6,20]],"date-time":"2023-06-20T00:00:00Z","timestamp":1687219200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,6,20]],"date-time":"2023-06-20T00:00:00Z","timestamp":1687219200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2023,10]]},"DOI":"10.1007\/s11263-023-01816-8","type":"journal-article","created":{"date-parts":[[2023,6,20]],"date-time":"2023-06-20T13:02:23Z","timestamp":1687266143000},"page":"2723-2737","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Visually-Guided Audio Spatialization in Video with Geometry-Aware Multi-task Learning"],"prefix":"10.1007","volume":"131","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4224-4890","authenticated-orcid":false,"given":"Rishabh","family":"Garg","sequence":"first","affiliation":[]},{"given":"Ruohan","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Kristen","family":"Grauman","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,6,20]]},"reference":[{"key":"1816_CR1","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J. S., & Zisserman, A. (2019). My lips are concealed: Audio-visual speech enhancement through obstructions. In ICASSP.","DOI":"10.21437\/Interspeech.2019-3114"},{"key":"1816_CR2","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., & Zisserman, A. (2017). Look, listen and learn. In ICCV.","DOI":"10.1109\/ICCV.2017.73"},{"key":"1816_CR3","doi-asserted-by":"crossref","unstructured":"Arandjelovi\u0107, R., & Zisserman, A. (2018). Objects that sound. In ECCV.","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"1816_CR4","volume-title":"Learning sound representations from unlabeled video","author":"Y Aytar","year":"2016","unstructured":"Aytar, Y., Vondrick, C., & Torralba, A. (2016). Learning sound representations from unlabeled video. NeurIPS: Soundnet."},{"key":"1816_CR5","doi-asserted-by":"crossref","unstructured":"Chang, A., Dai, A., Funkhouser, T., Halber, M., Niessner, M., Savva, M., Song, S., Zeng, A., Zhang, Y. (2017). Matterport3d: Learning from RGB-D data in indoor environments. In International conference on 3D vision (3DV). MatterPort3D dataset license available at: http:\/\/kaldir.vc.in.tum.de\/matterport\/MP_TOS.pdf","DOI":"10.1109\/3DV.2017.00081"},{"key":"1816_CR6","doi-asserted-by":"crossref","unstructured":"Chen, C., Al-Halah, Z., & Grauman, K. (2021). Semantic audio-visual navigation. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01526"},{"key":"1816_CR7","doi-asserted-by":"crossref","unstructured":"Chen, C., Gao, R., Calamia, P., & Grauman, K. (2022). Visual acoustic matching. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01829"},{"key":"1816_CR8","doi-asserted-by":"crossref","unstructured":"Chen, C., Jain, U., Schissler, C., Gari, S. V. A., Al-Halah, Z., Ithapu, V. K., Robinson, P., & Grauman, K. 
(2020). Soundspaces: Audio-visual navigation in 3D environments. In ECCV.","DOI":"10.1007\/978-3-030-58539-6_2"},{"key":"1816_CR9","doi-asserted-by":"crossref","unstructured":"Chen, C., Majumder, S., Al-Halah, Z., Gao, R., Ramakrishnan, S. K., & Grauman, K. (2020). Learning to set waypoints for audio-visual navigation. In ICLR.","DOI":"10.1109\/CVPR46437.2021.01526"},{"key":"1816_CR10","doi-asserted-by":"crossref","unstructured":"Chen, P., Zhang, Y., Tan, M., Xiao, H., Huang, D., & Gan, C. (2020). Generating visually aligned sound from videos. In IEEE TIP.","DOI":"10.1109\/TIP.2020.3009820"},{"key":"1816_CR11","doi-asserted-by":"crossref","unstructured":"Christensen, J. H., Hornauer, S., & Stella, X. Y. (2020). Batvision: Learning to see 3d spatial layout with two ears. In ICRA.","DOI":"10.1109\/ICRA40945.2020.9196934"},{"key":"1816_CR12","doi-asserted-by":"crossref","unstructured":"Chung, J. S., Senior, A., Vinyals, O., & Zisserman, A. (2017). Lip reading sentences in the wild. In CVPR.","DOI":"10.1109\/CVPR.2017.367"},{"key":"1816_CR13","unstructured":"Dean, V., Tulsiani, S., & Gupta, A. (2020). See, hear, explore: Curiosity via audio-visual association. In NeurIPS."},{"key":"1816_CR14","unstructured":"Engel, J., Agrawal, K. K., Chen, S., Gulrajani, I., Donahue, C., & Roberts, A. (2019). Gansynth: Adversarial neural audio synthesis. In ICLR."},{"key":"1816_CR15","doi-asserted-by":"crossref","unstructured":"Ephrat, A., Mosseri, I., Lang, O., Dekel, T., Wilson, K., Hassidim, A., Freeman, W. T., & Rubinstein, M. (2018). Looking to listen at the cocktail party: A speaker-independent audio-visual model for speech separation. In SIGGRAPH.","DOI":"10.1145\/3197517.3201357"},{"key":"1816_CR16","doi-asserted-by":"crossref","unstructured":"Font, F., Roma, G., & Serra, X. (2013). Freesound technical demo. In Proceedings of the 21st ACM International Conference on Multimedia.","DOI":"10.1145\/2502081.2502245"},{"key":"1816_CR17","doi-asserted-by":"crossref","unstructured":"Gabbay, A., Shamir, A., & Peleg, S. (2018). Visual speech enhancement. In INTERSPEECH.","DOI":"10.21437\/Interspeech.2018-1955"},{"key":"1816_CR18","doi-asserted-by":"crossref","unstructured":"Gan, C., Huang, D., Chen, P., Tenenbaum, J. B., & Torralba, A. (2020). Foley music: Learning to generate music from videos. In ECCV.","DOI":"10.1007\/978-3-030-58621-8_44"},{"key":"1816_CR19","doi-asserted-by":"crossref","unstructured":"Gan, C., Huang, D., Zhao, H., Tenenbaum, J. B., & Torralba, A. (2020). Music gesture for visual sound separation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01049"},{"key":"1816_CR20","doi-asserted-by":"crossref","unstructured":"Gan, C., Zhang, Y., Wu, J., Gong, B., Tenenbaum, J.B. (2020). Look, listen, and act: Towards audio-visual embodied navigation. ICRA.","DOI":"10.1109\/ICRA40945.2020.9197008"},{"key":"1816_CR21","doi-asserted-by":"crossref","unstructured":"Gao, R., Chen, C., Al-Halah, Z., Schissler, C., & Grauman, K. (2020). Visualechoes: Spatial image representation learning through echolocation. In ECCV.","DOI":"10.1007\/978-3-030-58545-7_38"},{"key":"1816_CR22","doi-asserted-by":"crossref","unstructured":"Gao, R., Feris, R., & Grauman, K. (2018). Learning to separate object sounds by watching unlabeled video. In ECCV.","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"1816_CR23","doi-asserted-by":"crossref","unstructured":"Gao, R., & Grauman, K. (2019a). 2.5d visual sound. In CVPR.","DOI":"10.1109\/CVPR.2019.00041"},{"key":"1816_CR24","doi-asserted-by":"crossref","unstructured":"Gao, R., & Grauman, K. 
(2019b). Co-separating sounds of visual objects. In ICCV.","DOI":"10.1109\/ICCV.2019.00398"},{"key":"1816_CR25","doi-asserted-by":"crossref","unstructured":"Gao, R., & Grauman, K. (2021). Visualvoice: Audio-visual speech separation with cross-modal consistency. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01524"},{"key":"1816_CR26","doi-asserted-by":"crossref","unstructured":"Gao, R., Oh, T.-H., Grauman, K., & Torresani, L. (2020). Listen to look: Action recognition by previewing audio. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01047"},{"key":"1816_CR27","unstructured":"Garg, R., Gao, R., & Grauman, K. (2021). Geometry-aware multi-task learning for binaural audio generation from video. In BMVC."},{"key":"1816_CR28","doi-asserted-by":"crossref","unstructured":"Griffin, D., & Lim, J. (1984). Signal estimation from modified short-time fourier transform. In IEEE Transactions on Acoustics, Speech, and Signal Processing.","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"1816_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"1816_CR30","doi-asserted-by":"crossref","unstructured":"Hu, D., & Li, X. (2016). Temporal multimodal learning in audiovisual speech recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.389"},{"key":"1816_CR31","unstructured":"Hu, D., Qian, R., Jiang, M., Tan, X., Wen, S., Ding, E., Lin, W., & Dou, D. (2020). Discriminative sounding objects localization via self-supervised audiovisual matching. In NeurIPS."},{"key":"1816_CR32","unstructured":"Kingma, D., & Ba, J. (2015). Adam: A method for stochastic optimization. In ICLR."},{"key":"1816_CR33","unstructured":"Korbar, B., Tran, D., & Torresani, L. (2018). Co-training of audio and video representations from self-supervised temporal synchronization. In NeurIPS."},{"key":"1816_CR34","doi-asserted-by":"crossref","unstructured":"Lu, Y.-D., Lee, H.-Y., Tseng, H.-Y., & Yang, M.-H. (2019). Self-supervised audio spatialization with correspondence classifier. In ICIP.","DOI":"10.1109\/ICIP.2019.8803494"},{"key":"1816_CR35","doi-asserted-by":"crossref","unstructured":"Majumder, S., Al-Halah, Z., & Grauman, K. (2021). Move2Hear: Active audio-visual source separation. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00034"},{"key":"1816_CR36","doi-asserted-by":"crossref","unstructured":"Majumder, S., & Grauman, K. (2022). Active audio-visual separation of dynamic sound sources. In ECCV.","DOI":"10.1007\/978-3-031-19842-7_32"},{"key":"1816_CR37","unstructured":"Morgado, P., Li, Y., & Nvasconcelos, N. (2020). Learning representations from audio-visual spatial alignment. In NeurIPS."},{"key":"1816_CR38","unstructured":"Morgado, P., Vasconcelos, N., Langlois, T., & Wang, O. (2018). Self-supervised generation of spatial audio for 360$${}^\\circ $$ video. In: NeurIPS."},{"key":"1816_CR39","unstructured":"Murphy, D. T., & Shelley, S. (2010). Openair: An interactive auralization web resource and database. In Audio Engineering Society Convention 129."},{"key":"1816_CR40","doi-asserted-by":"crossref","unstructured":"Owens, A., & Efros, A. A. (2018). Audio-visual scene analysis with self-supervised multisensory features. In ECCV.","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"1816_CR41","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E. H., & Freeman, W. T. (2016). Visually indicated sounds. 
In CVPR.","DOI":"10.1109\/CVPR.2016.264"},{"key":"1816_CR42","doi-asserted-by":"crossref","unstructured":"Owens, A., Wu, J., McDermott, J. H., Freeman, W. T., & Torralba, A. (2016). Ambient sound provides supervision for visual learning. In ECCV.","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"1816_CR43","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, Z., Antiga, L., Desmaison, A., K\u00f6pf, A., Yang, E. Z., DeVito, Z., Raison, M., Tejani, A., Chilamkurthy, S., Steiner, B., Fang, L., Bai, J., & Chintala, S. (2019). Pytorch: An imperative style, high-performance deep learning library. In NeurIPS."},{"key":"1816_CR44","doi-asserted-by":"crossref","unstructured":"Perraudin, N., Balazs, P., & S\u00f8ndergaard, P. L. (2013). A fast griffin-lim algorithm. In WASPAA.","DOI":"10.1109\/WASPAA.2013.6701851"},{"key":"1816_CR45","doi-asserted-by":"crossref","unstructured":"Purushwalkam, S., Gari, S. V. A., Ithapu, V. K., Schissler, C., Robinson, P., Gupta, A., & Grauman, K. (2021). Audio-visual floorplan reconstruction. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00122"},{"key":"1816_CR46","doi-asserted-by":"crossref","unstructured":"Rayleigh, L. (1875). On our perception of the direction of a source of sound. In Proceedings of the Musical Association.","DOI":"10.1093\/jrma\/2.1.75"},{"key":"1816_CR47","unstructured":"Richard, A., Markovic, D., Gebru, I. D., Krenn, S., Butler, G., de\u00a0la Torre, F., & Sheikh, Y. (2021). Neural synthesis of binaural speech from mono audio. In ICLR."},{"key":"1816_CR48","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T. (2015). U-net: Convolutional networks for biomedical image segmentation. In International conference on medical image computing and computer-assisted intervention.","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"1816_CR49","doi-asserted-by":"crossref","unstructured":"Rouditchenko, A., Zhao, H., Gan, C., McDermott, J., & Torralba, A. (2019). Self-supervised audio-visual co-segmentation. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8682467"},{"key":"1816_CR50","doi-asserted-by":"crossref","unstructured":"Savva, M., Kadian, A., Maksymets, O., Zhao, Y., Wijmans, E., Jain, B., Straub, J., Liu, J., Koltun, V., Malik, J., Parikh, D., & Batra D. (2019). Habitat: A platform for embodied ai research. In ICCV.","DOI":"10.1109\/ICCV.2019.00943"},{"key":"1816_CR51","doi-asserted-by":"crossref","unstructured":"Schissler, C., Loftin, C., & Manocha, D. (2017). Acoustic classification and optimization for multi-modal rendering of real-world scenes. IEEE Transactions on Visualization and Computer Graphics.","DOI":"10.1109\/TVCG.2017.2666150"},{"key":"1816_CR52","doi-asserted-by":"crossref","unstructured":"Schroeder, M. R. (1965). New method of measuring reverberation time. The Journal of the Acoustical Society of America, 37(6), 1187\u20131188.","DOI":"10.1121\/1.1939454"},{"key":"1816_CR53","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T.-H., Kim, J., Yang, M.-H., & So\u00a0Kweon, I. (2018). Learning to localize sound source in visual scenes. In CVPR.","DOI":"10.1109\/CVPR.2018.00458"},{"key":"1816_CR54","doi-asserted-by":"crossref","unstructured":"Tang, Z., Bryan, N.J., Li, D., Langlois, T. R., & Manocha, D. (2020). Scene-aware audio rendering via deep acoustic analysis. 
In IEEE Transactions on Visualization and Computer Graphics.","DOI":"10.1109\/TVCG.2020.2973058"},{"key":"1816_CR55","doi-asserted-by":"crossref","unstructured":"Tian, Y., Li, D., & Xu, C. (2020). Unified multisensory perception: Weakly-supervised audio-visual video parsing. In ECCV.","DOI":"10.1007\/978-3-030-58580-8_26"},{"key":"1816_CR56","doi-asserted-by":"crossref","unstructured":"Tian, Y., Shi, J., Li, B., Duan, Z., & Xu, C. (2018). Audio-visual event localization in unconstrained videos. In ECCV.","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"1816_CR57","unstructured":"Tzinis, E., Wisdom, S., Jansen, A., Hershey, S., Remez, T., Ellis, D. P., & Hershey, J. R. (2021). Into the wild with audioscope: Unsupervised audio-visual separation of on-screen sounds. In ICLR."},{"key":"1816_CR58","unstructured":"Van\u00a0der Maaten, L., & Hinton, G. (2008). Visualizing data using t-sne. In JMLR."},{"key":"1816_CR59","doi-asserted-by":"crossref","unstructured":"Wu, Y., Zhu, L., Yan, Y., & Yang, Y. (2019). Dual attention matching for audio-visual event localization. In ICCV.","DOI":"10.1109\/ICCV.2019.00639"},{"key":"1816_CR60","doi-asserted-by":"crossref","unstructured":"Xu, X., Dai, B., & Lin, D. (2019). Recursive visual sound separation using minus-plus net. In ICCV.","DOI":"10.1109\/ICCV.2019.00097"},{"key":"1816_CR61","doi-asserted-by":"crossref","unstructured":"Xu, X., Zhou, H., Liu, Z., Dai, B., Wang, X., & Lin, D. (2021). Visually informed binaural audio generation without binaural audios. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01523"},{"key":"1816_CR62","doi-asserted-by":"crossref","unstructured":"Yang, K., Russell, B., & Salamon, J. (2020). Telling left from right: Learning spatial correspondence of sight and sound. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00995"},{"key":"1816_CR63","doi-asserted-by":"crossref","unstructured":"Yu, J., Zhang, S.-X., Wu, J., Ghorbani, S., Wu, B., Kang, S., Liu, S., Liu, X., Meng, H., & Yu, D. (2020). Audio-visual recognition of overlapped speech for the lrs2 dataset. In ICASSP.","DOI":"10.1109\/ICASSP40776.2020.9054127"},{"issue":"6","key":"1816_CR64","doi-asserted-by":"publisher","first-page":"3616","DOI":"10.1121\/1.5040489","volume":"143","author":"M Zaunschirm","year":"2018","unstructured":"Zaunschirm, M., Sch\u00f6rkhuber, C., & H\u00f6ldrich, R. (2018). Binaural rendering of ambisonic signals by head-related impulse response time alignment and a diffuseness constraint. The Journal of the Acoustical Society of America, 143(6), 3616\u20133627.","journal-title":"The Journal of the Acoustical Society of America"},{"key":"1816_CR65","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Ma, W.-C., & Torralba, A. (2019). The sound of motions. In ICCV.","DOI":"10.1109\/ICCV.2019.00182"},{"key":"1816_CR66","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., & Torralba, A. (2018). The sound of pixels. In ECCV.","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"1816_CR67","doi-asserted-by":"crossref","unstructured":"Zhou, H., Liu, Y., Liu, Z., Luo, P., & Wang, X. (2019). Talking face generation by adversarially disentangled audio-visual representation. In AAAI.","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"1816_CR68","doi-asserted-by":"crossref","unstructured":"Zhou, H., Xu, X., Lin, D., Wang, X., & Liu, Z. (2020). Sep-stereo: Visually guided stereophonic audio generation by associating source separation. 
In ECCV.","DOI":"10.1007\/978-3-030-58610-2_4"},{"key":"1816_CR69","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, Z., Fang, C., Bui, T., & Berg, T. L. (2018). Visual to sound: Generating natural sound for videos in the wild. In CVPR.","DOI":"10.1109\/CVPR.2018.00374"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-023-01816-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-023-01816-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-023-01816-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,19]],"date-time":"2023-08-19T02:11:31Z","timestamp":1692411091000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-023-01816-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,20]]},"references-count":69,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2023,10]]}},"alternative-id":["1816"],"URL":"https:\/\/doi.org\/10.1007\/s11263-023-01816-8","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,6,20]]},"assertion":[{"value":"20 August 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 May 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 June 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}