{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T20:49:30Z","timestamp":1773262170481,"version":"3.50.1"},"publisher-location":"Cham","reference-count":70,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198410","type":"print"},{"value":"9783031198427","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19842-7_32","type":"book-chapter","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T12:12:59Z","timestamp":1666440779000},"page":"551-569","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":16,"title":["Active Audio-Visual Separation of\u00a0Dynamic Sound Sources"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2851-1510","authenticated-orcid":false,"given":"Sagnik","family":"Majumder","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9591-5873","authenticated-orcid":false,"given":"Kristen","family":"Grauman","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,23]]},"reference":[{"key":"32_CR1","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: The conversation: deep audio-visual speech enhancement. arXiv preprint arXiv:1804.04121 (2018)","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"32_CR2","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: My lips are concealed: audio-visual speech enhancement through obstructions. arXiv preprint arXiv:1907.04975 (2019)","DOI":"10.21437\/Interspeech.2019-3114"},{"issue":"4\u20135","key":"32_CR3","doi-asserted-by":"publisher","first-page":"437","DOI":"10.1177\/0278364914548050","volume":"34","author":"X Alameda-Pineda","year":"2015","unstructured":"Alameda-Pineda, X., Horaud, R.: Vision-guided robot hearing. Int. J. Robot. Res. 34(4\u20135), 437\u2013456 (2015)","journal-title":"Int. J. Robot. Res."},{"key":"32_CR4","unstructured":"Yu, Y., Huang, W., Sun, F., Chen, C., Wang, Y., Liu, X.: sound adversarial audio-visual navigation. In: Submitted to The Tenth International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=NkZq4OEYN-"},{"key":"32_CR5","doi-asserted-by":"crossref","unstructured":"Asano, F., Goto, M., Itou, K., Asoh, H.: Real-time sound source localization and separation system and its application to automatic speech recognition. In: Eurospeech (2001)","DOI":"10.21437\/Eurospeech.2001-291"},{"key":"32_CR6","doi-asserted-by":"crossref","unstructured":"Ban, Y., Li, X., Alameda-Pineda, X., Girin, L., Horaud, R.: Accounting for room acoustics in audio-visual multi-speaker tracking. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2018)","DOI":"10.1109\/ICASSP.2018.8462100"},{"key":"32_CR7","doi-asserted-by":"publisher","unstructured":"Barzelay, Z., Schechner, Y.Y.: Harmony in motion. 
In: 2007 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20138 (2007). https:\/\/doi.org\/10.1109\/CVPR.2007.383344","DOI":"10.1109\/CVPR.2007.383344"},{"key":"32_CR8","unstructured":"Bellemare, M.G., Srinivasan, S., Ostrovski, G., Schaul, T., Saxton, D., Munos, R.: Unifying count-based exploration and intrinsic motivation. arXiv preprint arXiv:1606.01868 (2016)"},{"issue":"2","key":"32_CR9","doi-asserted-by":"publisher","first-page":"477","DOI":"10.1007\/s10514-017-9639-8","volume":"42","author":"G Bustamante","year":"2017","unstructured":"Bustamante, G., Dan\u00e8s, P., Forgue, T., Podlubne, A., Manh\u00e8s, J.: An information based feedback control for audio-motor binaural localization. Auton. Robots 42(2), 477\u2013490 (2017). https:\/\/doi.org\/10.1007\/s10514-017-9639-8","journal-title":"Auton. Robots"},{"key":"32_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"406","DOI":"10.1007\/978-3-030-66823-5_24","volume-title":"Computer Vision","author":"T Campari","year":"2020","unstructured":"Campari, T., Eccher, P., Serafini, L., Ballan, L.: Exploiting scene-specific features for object goal navigation. In: Bartoli, A., Fusiello, A. (eds.) ECCV 2020. LNCS, vol. 12538, pp. 406\u2013421. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-66823-5_24"},{"key":"32_CR11","doi-asserted-by":"crossref","unstructured":"Chang, A., et al.: Matterport3D: learning from RGB-D data in indoor environments. In: International Conference on 3D Vision (3DV) (2017). Matterport3D dataset license. http:\/\/kaldir.vc.in.tum.de\/matterport\/MP_TOS.pdf","DOI":"10.1109\/3DV.2017.00081"},{"key":"32_CR12","doi-asserted-by":"crossref","unstructured":"Chen, C., Al-Halah, Z., Grauman, K.: Semantic audio-visual navigation. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01526"},{"key":"32_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1007\/978-3-030-58539-6_2","volume-title":"Computer Vision","author":"C Chen","year":"2020","unstructured":"Chen, C., et al.: SoundSpaces: audio-visual navigation in\u00a03D\u00a0environments. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 17\u201336. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_2"},{"key":"32_CR14","unstructured":"Chen, C., Majumder, S., Al-Halah, Z., Gao, R., Ramakrishnan, S.K., Grauman, K.: Learning to set waypoints for audio-visual navigation. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=cR91FAodFMe"},{"key":"32_CR15","doi-asserted-by":"crossref","unstructured":"Chen, J., Mao, Q., Liu, D.: Dual-path transformer network: direct context-aware modeling for end-to-end monaural speech separation. arXiv preprint arXiv:2007.13975 (2020)","DOI":"10.21437\/Interspeech.2020-2205"},{"key":"32_CR16","doi-asserted-by":"crossref","unstructured":"Chen, K., Chen, J.K., Chuang, J., V\u00e1zquez, M., Savarese, S.: Topological planning with transformers for vision-and-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11276\u201311286 (2021)","DOI":"10.1109\/CVPR46437.2021.01112"},{"key":"32_CR17","unstructured":"Chung, J., Kastner, K., Dinh, L., Goel, K., Courville, A.C., Bengio, Y.: A recurrent latent variable model for sequential data. 
In: NeurIPS (2015)"},{"key":"32_CR18","doi-asserted-by":"crossref","unstructured":"Chung, S.W., Choe, S., Chung, J.S., Kang, H.G.: Facefilter: audio-visual speech separation using still images. arXiv preprint arXiv:2005.07074 (2020)","DOI":"10.21437\/Interspeech.2020-1065"},{"key":"32_CR19","doi-asserted-by":"publisher","unstructured":"Deleforge, A., Horaud, R.: The cocktail party robot: sound source separation and localisation with an active binaural head. In: HRI 2012\u20137th ACM\/IEEE International Conference on Human Robot Interaction, pp. 431\u2013438. ACM, Boston, United States, March 2012. https:\/\/doi.org\/10.1145\/2157689.2157834, https:\/\/hal.inria.fr\/hal-00768668","DOI":"10.1145\/2157689.2157834"},{"issue":"7","key":"32_CR20","doi-asserted-by":"publisher","first-page":"1830","DOI":"10.1109\/TASL.2010.2050716","volume":"18","author":"NQ Duong","year":"2010","unstructured":"Duong, N.Q., Vincent, E., Gribonval, R.: Under-determined reverberant audio source separation using a full-rank spatial covariance model. IEEE Trans. Audio Speech Lang. Process. 18(7), 1830\u20131840 (2010)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"32_CR21","doi-asserted-by":"crossref","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:1804.03619 (2018)","DOI":"10.1145\/3197517.3201357"},{"key":"32_CR22","doi-asserted-by":"crossref","unstructured":"Fang, K., Toshev, A., Fei-Fei, L., Savarese, S.: Scene memory transformer for embodied agents in long-horizon tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 538\u2013547 (2019)","DOI":"10.1109\/CVPR.2019.00063"},{"key":"32_CR23","unstructured":"Fisher III, J.W., Darrell, T., Freeman, W., Viola, P.: Learning joint statistical models for audio-visual fusion and segregation. In: Leen, T., Dietterich, T., Tresp, V. (eds.) Advances in Neural Information Processing Systems, vol. 13, pp. 772\u2013778. MIT Press (2001). https:\/\/proceedings.neurips.cc\/paper\/2000\/file\/11f524c3fbfeeca4aa916edcb6b6392e-Paper.pdf"},{"key":"32_CR24","doi-asserted-by":"crossref","unstructured":"Gabbay, A., Shamir, A., Peleg, S.: Visual speech enhancement. arXiv preprint arXiv:1711.08789 (2017)","DOI":"10.21437\/Interspeech.2018-1955"},{"key":"32_CR25","doi-asserted-by":"crossref","unstructured":"Gan, C., Huang, D., Zhao, H., Tenenbaum, J.B., Torralba, A.: Music gesture for visual sound separation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10478\u201310487 (2020)","DOI":"10.1109\/CVPR42600.2020.01049"},{"key":"32_CR26","doi-asserted-by":"crossref","unstructured":"Gan, C., Zhang, Y., Wu, J., Gong, B., Tenenbaum, J.B.: Look, listen, and act: towards audio-visual embodied navigation. In: ICRA (2020)","DOI":"10.1109\/ICRA40945.2020.9197008"},{"key":"32_CR27","doi-asserted-by":"crossref","unstructured":"Gao, R., Feris, R., Grauman, K.: Learning to separate object sounds by watching unlabeled video. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 35\u201353 (2018)","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"32_CR28","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: 2.5d visual sound. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00041"},{"key":"32_CR29","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: Co-separating sounds of visual objects. 
In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3879\u20133888 (2019)","DOI":"10.1109\/ICCV.2019.00398"},{"key":"32_CR30","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: Visualvoice: audio-visual speech separation with cross-modal consistency. arXiv preprint arXiv:2101.03149 (2021)","DOI":"10.1109\/CVPR46437.2021.01524"},{"key":"32_CR31","doi-asserted-by":"publisher","unstructured":"Gu, R., et al.: Neural spatial filter: target speaker speech separation assisted with directional information. In: Kubin, G., Kacic, Z. (eds.) Interspeech 2019, 20th Annual Conference of the International Speech Communication Association, Graz, Austria, 15\u201319 September 2019, pp. 4290\u20134294. ISCA (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-2266","DOI":"10.21437\/Interspeech.2019-2266"},{"key":"32_CR32","unstructured":"Gu, R., Zou, Y.: Temporal-spatial neural filter: direction informed end-to-end multi-channel target speech separation. arXiv preprint arXiv:2001.00391 (2020)"},{"key":"32_CR33","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"32_CR34","unstructured":"Hershey, J.R., Movellan, J.R.: Audio vision: Using audio-visual synchrony to locate sounds. In: NeurIPS (2000)"},{"key":"32_CR35","doi-asserted-by":"publisher","unstructured":"Huang, P., Kim, M., Hasegawa-Johnson, M., Smaragdis, P.: Deep learning for monaural speech separation. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1562\u20131566 (2014). https:\/\/doi.org\/10.1109\/ICASSP.2014.6853860","DOI":"10.1109\/ICASSP.2014.6853860"},{"key":"32_CR36","doi-asserted-by":"publisher","unstructured":"Li, B., Dinesh, K., Duan, Z., Sharma, G.: See and listen: score-informed association of sound tracks to players in chamber music performance videos. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2906\u20132910 (2017). https:\/\/doi.org\/10.1109\/ICASSP.2017.7952688","DOI":"10.1109\/ICASSP.2017.7952688"},{"key":"32_CR37","unstructured":"Lu, W.T., Wang, J.C., Won, M., Choi, K., Song, X.: Spectnt: a time-frequency transformer for music audio. arXiv preprint arXiv:2110.09127 (2021)"},{"key":"32_CR38","doi-asserted-by":"crossref","unstructured":"Majumder, S., Al-Halah, Z., Grauman, K.: Move2Hear: active audio-visual source separation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00034"},{"key":"32_CR39","unstructured":"Mezghani, L., et al.: Memory-augmented reinforcement learning for image-goal navigation. arXiv preprint arXiv:2101.05181 (2021)"},{"key":"32_CR40","unstructured":"Mezghani, L., Sukhbaatar, S., Szlam, A., Joulin, A., Bojanowski, P.: Learning to visually navigate in photorealistic environments without any supervision. arXiv preprint arXiv:2004.04954 (2020)"},{"issue":"4","key":"32_CR41","doi-asserted-by":"publisher","first-page":"800","DOI":"10.1109\/JSTSP.2019.2922820","volume":"13","author":"K \u017dmol\u00edkov\u00e1","year":"2019","unstructured":"\u017dmol\u00edkov\u00e1, K., et al.: Speakerbeam: speaker aware neural network for target speaker extraction in speech mixtures. IEEE J. Sel. Top. Sign. Proces. 13(4), 800\u2013814 (2019). https:\/\/doi.org\/10.1109\/JSTSP.2019.2922820","journal-title":"IEEE J. Sel. Top. Sign. 
Proces."},{"key":"32_CR42","doi-asserted-by":"crossref","unstructured":"Nakadai, K., Hidai, K.i., Okuno, H.G., Kitano, H.: Real-time speaker localization and speech separation by audio-visual integration. In: Proceedings 2002 IEEE International Conference on Robotics and Automation (Cat. No. 02CH37292), vol. 1, pp. 1043\u20131049. IEEE (2002)","DOI":"10.1109\/ROBOT.2002.1013493"},{"key":"32_CR43","unstructured":"Nakadai, K., Lourens, T., Okuno, H.G., Kitano, H.: Active audition for humanoid. In: AAAI (2000)"},{"key":"32_CR44","doi-asserted-by":"publisher","unstructured":"Ochiai, T., et al.: Listen to what you want: neural network-based universal sound selector. In: Meng, H., Xu, B., Zheng, T.F. (eds.) Interspeech 2020, 21st Annual Conference of the International Speech Communication Association, Virtual Event, Shanghai, China, 25\u201329 October 2020, pp. 1441\u20131445. ISCA (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-2210, https:\/\/doi.org\/10.21437\/Interspeech.2020-2210","DOI":"10.21437\/Interspeech.2020-2210"},{"key":"32_CR45","doi-asserted-by":"crossref","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 631\u2013648 (2018)","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"32_CR46","doi-asserted-by":"publisher","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: An ASR corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210 (2015). https:\/\/doi.org\/10.1109\/ICASSP.2015.7178964","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"32_CR47","doi-asserted-by":"publisher","unstructured":"Parekh, S., Essid, S., Ozerov, A., Duong, N.Q.K., P\u00e9rez, P., Richard, G.: Motion informed audio source separation. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6\u201310 (2017). https:\/\/doi.org\/10.1109\/ICASSP.2017.7951787","DOI":"10.1109\/ICASSP.2017.7951787"},{"key":"32_CR48","doi-asserted-by":"publisher","unstructured":"Piczak, K.J.: ESC: dataset for environmental sound classification. In: Proceedings of the 23rd Annual ACM Conference on Multimedia, pp. 1015\u20131018. ACM Press. https:\/\/doi.org\/10.1145\/2733373.2806390, http:\/\/dl.acm.org\/citation.cfm?doid=2733373.2806390","DOI":"10.1145\/2733373.2806390"},{"key":"32_CR49","doi-asserted-by":"crossref","unstructured":"Pu, J., Panagakis, Y., Petridis, S., Pantic, M.: Audio-visual object localization and separation using low-rank and sparsity. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2901\u20132905. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952687"},{"key":"32_CR50","unstructured":"Ramakrishnan, S.K., Nagarajan, T., Al-Halah, Z., Grauman, K.: Environment predictive coding for embodied agents. arXiv preprint arXiv:2102.02337 (2021)"},{"key":"32_CR51","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). 
https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"32_CR52","doi-asserted-by":"publisher","unstructured":"Roux, J.L., Wisdom, S., Erdogan, H., Hershey, J.R.: SDR - half-baked or well done? In: ICASSP 2019\u20132019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 626\u2013630 (2019). https:\/\/doi.org\/10.1109\/ICASSP.2019.8683855","DOI":"10.1109\/ICASSP.2019.8683855"},{"key":"32_CR53","doi-asserted-by":"crossref","unstructured":"Savva, M., et al.: Habitat: a platform for embodied AI research. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00943"},{"key":"32_CR54","doi-asserted-by":"publisher","unstructured":"Sedighin, F., Babaie-Zadeh, M., Rivet, B., Jutten, C.: Two multimodal approaches for single microphone source separation. In: 2016 24th European Signal Processing Conference (EUSIPCO), pp. 110\u2013114 (2016). https:\/\/doi.org\/10.1109\/EUSIPCO.2016.7760220","DOI":"10.1109\/EUSIPCO.2016.7760220"},{"key":"32_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"414","DOI":"10.1007\/978-3-540-74494-8_52","volume-title":"Independent Component Analysis and Signal Separation","author":"P Smaragdis","year":"2007","unstructured":"Smaragdis, P., Raj, B., Shashanka, M.: Supervised and semi-supervised separation of sounds from single-channel mixtures. In: Davies, M.E., James, C.J., Abdallah, S.A., Plumbley, M.D. (eds.) ICA 2007. LNCS, vol. 4666, pp. 414\u2013421. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-74494-8_52"},{"key":"32_CR56","unstructured":"Smaragdis, P., Casey, M.: Audio\/visual independent components. In: Proceedings of International Symposium on Independent Component Analysis and Blind Source Separation (2003)"},{"key":"32_CR57","unstructured":"Spiertz, M., Gnann, V.: Source-filter based clustering for monaural blind source separation. In: Proceedings of International Conference on Digital Audio Effects DAFx\u201909 (2009)"},{"key":"32_CR58","doi-asserted-by":"crossref","unstructured":"Subakan, C., Ravanelli, M., Cornell, S., Bronzi, M., Zhong, J.: Attention is all you need in speech separation. In: ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 21\u201325. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9413901"},{"key":"32_CR59","unstructured":"Tzinis, E., et al.: Into the wild with audioscope: unsupervised audio-visual separation of on-screen sounds. arXiv preprint arXiv:2011.01143 (2020)"},{"key":"32_CR60","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"issue":"6","key":"32_CR61","doi-asserted-by":"publisher","first-page":"9522","DOI":"10.3390\/s140609522","volume":"14","author":"R Viciana-Abad","year":"2014","unstructured":"Viciana-Abad, R., Marfil, R., Perez-Lorenzo, J., Bandera, J., Romero-Garces, A., Reche-Lopez, P.: Audio-visual perception system for a humanoid robotic head. Sensors 14(6), 9522\u20139545 (2014)","journal-title":"Sensors"},{"key":"32_CR62","doi-asserted-by":"crossref","unstructured":"Virtanen, T.: Monaural sound source separation by nonnegative matrix factorization with temporal continuity and sparseness criteria. IEEE Trans. Audio Speech Lang. Process. 
15(3), 1066\u20131074 (2007)","DOI":"10.1109\/TASL.2006.885253"},{"key":"32_CR63","doi-asserted-by":"crossref","unstructured":"Weiss, R.J., Mandel, M.I., Ellis, D.P.: Source separation based on binaural cues and source model constraints. In: Ninth Annual Conference of the International Speech Communication Association (2008)","DOI":"10.21437\/Interspeech.2008-51"},{"key":"32_CR64","unstructured":"Wijmans, E., et al.: DD-PPO: learning near-perfect pointgoal navigators from 2.5 billion frames. arXiv preprint arXiv:1911.00357 (2019)"},{"key":"32_CR65","doi-asserted-by":"crossref","unstructured":"Xu, X., Dai, B., Lin, D.: Recursive visual sound separation using minus-plus net. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 882\u2013891 (2019)","DOI":"10.1109\/ICCV.2019.00097"},{"key":"32_CR66","doi-asserted-by":"crossref","unstructured":"Y\u0131lmaz, \u00d6., Rickard, S.: Blind separation of speech mixtures via time-frequency masking. IEEE Transactions on Signal Processing (2004)","DOI":"10.1109\/TSP.2004.828896"},{"key":"32_CR67","unstructured":"Zadeh, A., Ma, T., Poria, S., Morency, L.P.: Wildmix dataset and spectro-temporal transformer model for monoaural audio source separation. arXiv preprint arXiv:1911.09783 (2019)"},{"issue":"5","key":"32_CR68","doi-asserted-by":"publisher","first-page":"1075","DOI":"10.1109\/TASLP.2017.2687104","volume":"25","author":"X Zhang","year":"2017","unstructured":"Zhang, X., Wang, D.: Deep learning based binaural speech separation in reverberant environments. IEEE\/ACM Trans. Audio Speech Lang. Process. 25(5), 1075\u20131084 (2017)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"32_CR69","doi-asserted-by":"crossref","unstructured":"Zhang, Z., He, B., Zhang, Z.: Transmask: a compact and fast speech separation model based on transformer. In: ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5764\u20135768. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9413670"},{"key":"32_CR70","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 
570\u2013586 (2018)","DOI":"10.1007\/978-3-030-01246-5_35"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19842-7_32","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T10:14:46Z","timestamp":1728209686000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19842-7_32"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198410","9783031198427"],"references-count":70,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19842-7_32","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"23 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the 
conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}