{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:24:29Z","timestamp":1777656269800,"version":"3.51.4"},"publisher-location":"Singapore","reference-count":69,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819609598","type":"print"},{"value":"9789819609604","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T00:00:00Z","timestamp":1733616000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T00:00:00Z","timestamp":1733616000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0960-4_7","type":"book-chapter","created":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T07:37:03Z","timestamp":1733557023000},"page":"104-122","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["High-Quality Visually-Guided Sound Separation from\u00a0Diverse Categories"],"prefix":"10.1007","author":[{"given":"Chao","family":"Huang","sequence":"first","affiliation":[]},{"given":"Susan","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Yapeng","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Anurag","family":"Kumar","sequence":"additional","affiliation":[]},{"given":"Chenliang","family":"Xu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,8]]},"reference":[{"key":"7_CR1","doi-asserted-by":"crossref","unstructured":"Afouras, T., Owens, A., Chung, J.S., Zisserman, A.: Self-supervised learning of audio-visual objects from video. In: European Conference on Computer Vision. pp. 208\u2013224. Springer (2020)","DOI":"10.1007\/978-3-030-58523-5_13"},{"key":"7_CR2","unstructured":"Amit, T., Shaharbany, T., Nachmani, E., Wolf, L.: Segdiff: Image segmentation with diffusion probabilistic models. arXiv preprint arXiv:2112.00390 (2021)"},{"key":"7_CR3","first-page":"17981","volume":"34","author":"J Austin","year":"2021","unstructured":"Austin, J., Johnson, D.D., Ho, J., Tarlow, D., van den Berg, R.: Structured denoising diffusion models in discrete state-spaces. Adv. Neural. Inf. Process. Syst. 34, 17981\u201317993 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR4","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Lischinski, D., Fried, O.: Blended diffusion for text-driven editing of natural images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18208\u201318218 (2022)","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"7_CR5","unstructured":"Baranchuk, D., Rubachev, I., Voynov, A., Khrulkov, V., Babenko, A.: Label-efficient semantic segmentation with diffusion models. arXiv preprint arXiv:2112.03126 (2021)"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Brempong, E.A., Kornblith, S., Chen, T., Parmar, N., Minderer, M., Norouzi, M.: Denoising pretraining for semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 4175\u20134186 (2022)","DOI":"10.1109\/CVPRW56347.2022.00462"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Chatterjee, M., Ahuja, N., Cherian, A.: Learning audio-visual dynamics using scene graphs for audio source separation. In: NeurIPS (2022)","DOI":"10.1109\/ICCV48922.2021.00124"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Chatterjee, M., Le\u00a0Roux, J., Ahuja, N., Cherian, A.: Visual scene graphs for audio source separation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 1204\u20131213 (2021)","DOI":"10.1109\/ICCV48922.2021.00124"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Chen, J., Zhang, R., Lian, D., Yang, J., Zeng, Z., Shi, J.: iquery: Instruments as queries for audio-visual sound separation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 14675\u201314686 (2023)","DOI":"10.1109\/CVPR52729.2023.01410"},{"key":"7_CR10","unstructured":"Chen, N., Zhang, Y., Zen, H., Weiss, R.J., Norouzi, M., Chan, W.: Wavegrad: Estimating gradients for waveform generation. arXiv preprint arXiv:2009.00713 (2020)"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Chen, S., Sun, P., Song, Y., Luo, P.: Diffusiondet: Diffusion model for object detection. arXiv preprint arXiv:2211.09788 (2022)","DOI":"10.1109\/ICCV51070.2023.01816"},{"key":"7_CR12","unstructured":"Chen, T., Zhang, R., Hinton, G.: Analog bits: Generating discrete data using diffusion models with self-conditioning. arXiv preprint arXiv:2208.04202 (2022)"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Chou, J.C., Chien, C.M., Livescu, K.: Av2wav: Diffusion-based re-synthesis from continuous self-supervised features for audio-visual speech enhancement. arXiv preprint arXiv:2309.08030 (2023)","DOI":"10.1109\/ICASSP48485.2024.10446625"},{"key":"7_CR14","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR15","unstructured":"Dong, H.W., Takahashi, N., Mitsufuji, Y., McAuley, J., Berg-Kirkpatrick, T.: Clipsep: Learning text-queried sound separation with noisy unlabeled videos. In: Proceedings of International Conference on Learning Representations (ICLR) (2023)"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Dumoulin, V., Perez, E., Schucher, N., Strub, F., Vries, H.d., Courville, A., Bengio, Y.: Feature-wise transformations. Distill 3(7), e11 (2018)","DOI":"10.23915\/distill.00011"},{"key":"7_CR17","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.neunet.2017.12.012","volume":"107","author":"S Elfwing","year":"2018","unstructured":"Elfwing, S., Uchibe, E., Doya, K.: Sigmoid-weighted linear units for neural network function approximation in reinforcement learning. Neural Netw. 107, 3\u201311 (2018)","journal-title":"Neural Netw."},{"key":"7_CR18","doi-asserted-by":"crossref","unstructured":"Ephrat, A., Mosseri, I., Lang, O., Dekel, T., Wilson, K., Hassidim, A., Freeman, W.T., Rubinstein, M.: Looking to listen at the cocktail party: A speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:1804.03619 (2018)","DOI":"10.1145\/3197517.3201357"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Gan, C., Huang, D., Zhao, H., Tenenbaum, J.B., Torralba, A.: Music gesture for visual sound separation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10478\u201310487 (2020)","DOI":"10.1109\/CVPR42600.2020.01049"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Gao, R., Feris, R., Grauman, K.: Learning to separate object sounds by watching unlabeled video. In: Proceedings of the European Conference on Computer Vision (ECCV). pp. 35\u201353 (2018)","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"7_CR21","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: Co-separating sounds of visual objects. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 3879\u20133888 (2019)","DOI":"10.1109\/ICCV.2019.00398"},{"key":"7_CR22","unstructured":"Gong, S., Li, M., Feng, J., Wu, Z., Kong, L.: Diffuseq: Sequence to sequence text generation with diffusion models. arXiv preprint arXiv:2210.08933 (2022)"},{"key":"7_CR23","doi-asserted-by":"crossref","unstructured":"Gu, S., Chen, D., Bao, J., Wen, F., Zhang, B., Chen, D., Yuan, L., Guo, B.: Vector quantized diffusion model for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10696\u201310706 (2022)","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"7_CR24","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR25","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. arXiv preprint arXiv:2204.03458 (2022)"},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Huang, C., Tian, Y., Kumar, A., Xu, C.: Egocentric audio-visual object localization. arXiv preprint arXiv:2303.13471 (2023)","DOI":"10.1109\/CVPR52729.2023.02194"},{"key":"7_CR27","doi-asserted-by":"crossref","unstructured":"Huang, R., Zhao, Z., Liu, H., Liu, J., Cui, C., Ren, Y.: Prodiff: Progressive fast diffusion model for high-quality text-to-speech. In: Proceedings of the 30th ACM International Conference on Multimedia. pp. 2595\u20132605 (2022)","DOI":"10.1145\/3503161.3547855"},{"key":"7_CR28","unstructured":"Kong, Z., Ping, W., Huang, J., Zhao, K., Catanzaro, B.: Diffwave: A versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761 (2020)"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Lee, J., Han, S.: Nu-wave: A diffusion probabilistic model for neural audio upsampling. arXiv preprint arXiv:2104.02321 (2021)","DOI":"10.21437\/Interspeech.2021-36"},{"key":"7_CR30","doi-asserted-by":"crossref","unstructured":"Lee, S., Jung, C., Jang, Y., Kim, J., Chung, J.S.: Seeing through the conversation: Audio-visual speech separation based on diffusion model. arXiv preprint arXiv:2310.19581 (2023)","DOI":"10.1109\/ICASSP48485.2024.10447679"},{"key":"7_CR31","first-page":"4328","volume":"35","author":"X Li","year":"2022","unstructured":"Li, X., Thickstun, J., Gulrajani, I., Liang, P.S., Hashimoto, T.B.: Diffusion-lm improves controllable text generation. Adv. Neural. Inf. Process. Syst. 35, 4328\u20134343 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR32","unstructured":"Meng, C., He, Y., Song, Y., Song, J., Wu, J., Zhu, J.Y., Ermon, S.: Sdedit: Guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073 (2021)"},{"key":"7_CR33","doi-asserted-by":"publisher","first-page":"1368","DOI":"10.1109\/TASLP.2021.3066303","volume":"29","author":"D Michelsanti","year":"2021","unstructured":"Michelsanti, D., Tan, Z.H., Zhang, S.X., Xu, Y., Yu, M., Yu, D., Jensen, J.: An overview of deep-learning-based audio-visual speech enhancement and separation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing 29, 1368\u20131396 (2021)","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"7_CR34","unstructured":"Mittal, H., Morgado, P., Jain, U., Gupta, A.: Learning state-aware visual representations from audible interactions. In: Proceedings of the European conference on computer vision (ECCV) (2022)"},{"key":"7_CR35","unstructured":"Nichol, A., Dhariwal, P., Ramesh, A., Shyam, P., Mishkin, P., McGrew, B., Sutskever, I., Chen, M.: Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"7_CR36","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning. pp. 8162\u20138171. PMLR (2021)"},{"key":"7_CR37","doi-asserted-by":"crossref","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Proceedings of the European Conference on Computer Vision (ECCV). pp. 631\u2013648 (2018)","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"7_CR38","unstructured":"Popov, V., Vovk, I., Gogoryan, V., Sadekova, T., Kudinov, M.: Grad-tts: A diffusion probabilistic model for text-to-speech. In: International Conference on Machine Learning. pp. 8599\u20138608. PMLR (2021)"},{"key":"7_CR39","doi-asserted-by":"crossref","unstructured":"Qian, R., Hu, D., Dinkel, H., Wu, M., Xu, N., Lin, W.: Multiple sound sources localization from coarse to fine. In: European Conference on Computer Vision. pp. 292\u2013308. Springer (2020)","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"7_CR40","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"7_CR41","unstructured":"Raffel, C., McFee, B., Humphrey, E.J., Salamon, J., Nieto, O., Liang, D., Ellis, D.P., Raffel, C.C.: Mir_eval: A transparent implementation of common mir metrics. In: ISMIR. pp. 367\u2013372 (2014)"},{"key":"7_CR42","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"7_CR43","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"7_CR44","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: Convolutional networks for biomedical image segmentation. In: Medical Image Computing and Computer-Assisted Intervention\u2013MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18. pp. 234\u2013241. Springer (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"7_CR45","doi-asserted-by":"crossref","unstructured":"Ruan, L., Ma, Y., Yang, H., He, H., Liu, B., Fu, J., Yuan, N.J., Jin, Q., Guo, B.: Mm-diffusion: Learning multi-modal diffusion models for joint audio and video generation. arXiv preprint arXiv:2212.09478 (2022)","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"7_CR46","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. arXiv preprint arXiv:2208.12242 (2022)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"7_CR47","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E.L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR48","doi-asserted-by":"crossref","unstructured":"Scheibler, R., Ji, Y., Chung, S.W., Byun, J., Choe, S., Choi, M.S.: Diffusion-based generative speech source separation. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095310"},{"key":"7_CR49","unstructured":"Shen, Z., Zhang, M., Zhao, H., Yi, S., Li, H.: Efficient attention: Attention with linear complexities. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision. pp. 3531\u20133539 (2021)"},{"key":"7_CR50","unstructured":"Singer, U., Polyak, A., Hayes, T., Yin, X., An, J., Zhang, S., Hu, Q., Yang, H., Ashual, O., Gafni, O., et\u00a0al.: Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)"},{"key":"7_CR51","doi-asserted-by":"crossref","unstructured":"Smaragdis, P., Brown, J.C.: Non-negative matrix factorization for polyphonic music transcription. In: 2003 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (IEEE Cat. No. 03TH8684). pp. 177\u2013180. IEEE (2003)","DOI":"10.1109\/ASPAA.2003.1285860"},{"key":"7_CR52","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"7_CR53","unstructured":"Song, Y., Ermon, S.: Generative modeling by estimating gradients of the data distribution. Advances in neural information processing systems 32 (2019)"},{"key":"7_CR54","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)"},{"key":"7_CR55","unstructured":"Spiertz, M., Gnann, V.: Source-filter based clustering for monaural blind source separation. In: Proceedings of the 12th International Conference on Digital Audio Effects. vol.\u00a04, p.\u00a06 (2009)"},{"key":"7_CR56","doi-asserted-by":"crossref","unstructured":"Tan, R., Ray, A., Burns, A., Plummer, B.A., Salamon, J., Nieto, O., Russell, B., Saenko, K.: Language-guided audio-visual source separation via trimodal consistency. arXiv preprint arXiv:2303.16342 (2023)","DOI":"10.1109\/CVPR52729.2023.01019"},{"key":"7_CR57","doi-asserted-by":"crossref","unstructured":"Tian, Y., Hu, D., Xu, C.: Cyclic co-learning of sounding object visual grounding and sound separation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2745\u20132754 (2021)","DOI":"10.1109\/CVPR46437.2021.00277"},{"key":"7_CR58","doi-asserted-by":"crossref","unstructured":"Tian, Y., Shi, J., Li, B., Duan, Z., Xu, C.: Audio-visual event localization in unconstrained videos. In: Proceedings of the European Conference on Computer Vision (ECCV). pp. 247\u2013263 (2018)","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"7_CR59","unstructured":"Tzinis, E., Wisdom, S., Jansen, A., Hershey, S., Remez, T., Ellis, D.P., Hershey, J.R.: Into the wild with audioscope: Unsupervised audio-visual separation of on-screen sounds. arXiv preprint arXiv:2011.01143 (2020)"},{"key":"7_CR60","doi-asserted-by":"crossref","unstructured":"Tzinis, E., Wisdom, S., Remez, T., Hershey, J.R.: Audioscopev2: Audio-visual attention architectures for calibrated open-domain on-screen sound separation. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXVII. pp. 368\u2013385. Springer (2022)","DOI":"10.1007\/978-3-031-19836-6_21"},{"key":"7_CR61","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"issue":"3","key":"7_CR62","doi-asserted-by":"publisher","first-page":"1066","DOI":"10.1109\/TASL.2006.885253","volume":"15","author":"T Virtanen","year":"2007","unstructured":"Virtanen, T.: Monaural sound source separation by nonnegative matrix factorization with temporal continuity and sparseness criteria. IEEE Trans. Audio Speech Lang. Process. 15(3), 1066\u20131074 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"7_CR63","doi-asserted-by":"crossref","unstructured":"Wang, Z.Q., Cornell, S., Choi, S., Lee, Y., Kim, B.Y., Watanabe, S.: Tf-gridnet: Making time-frequency domain models great again for monaural speaker separation. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10094992"},{"key":"7_CR64","doi-asserted-by":"crossref","unstructured":"Wu, Y., He, K.: Group normalization. In: Proceedings of the European conference on computer vision (ECCV). pp. 3\u201319 (2018)","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"7_CR65","doi-asserted-by":"crossref","unstructured":"Xu, X., Dai, B., Lin, D.: Recursive visual sound separation using minus-plus net. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 882\u2013891 (2019)","DOI":"10.1109\/ICCV.2019.00097"},{"key":"7_CR66","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Ma, W.C., Torralba, A.: The sound of motions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 1735\u20131744 (2019)","DOI":"10.1109\/ICCV.2019.00182"},{"key":"7_CR67","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. In: Proceedings of the European conference on computer vision (ECCV). pp. 570\u2013586 (2018)","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"7_CR68","doi-asserted-by":"crossref","unstructured":"Zhu, L., Rahtu, E.: Visually guided sound source separation using cascaded opponent filter network. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69544-6_25"},{"key":"7_CR69","doi-asserted-by":"crossref","unstructured":"Zhu, L., Rahtu, E.: Visually guided sound source separation and localization using self-supervised motion representations. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 1289\u20131299 (2022)","DOI":"10.1109\/WACV51458.2022.00223"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0960-4_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T08:31:58Z","timestamp":1733560318000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0960-4_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,8]]},"ISBN":["9789819609598","9789819609604"],"references-count":69,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0960-4_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,8]]},"assertion":[{"value":"8 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}