{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T02:02:44Z","timestamp":1782352964631,"version":"3.54.5"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012458","type":"print"},{"value":"9783030012465","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01246-5_35","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T16:14:56Z","timestamp":1538756096000},"page":"587-604","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":289,"title":["The Sound of Pixels"],"prefix":"10.1007","author":[{"given":"Hang","family":"Zhao","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chuang","family":"Gan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Rouditchenko","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Carl","family":"Vondrick","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Josh","family":"McDermott","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Antonio","family":"Torralba","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"35_CR1","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen and learn. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 609\u2013617. IEEE (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"35_CR2","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Objects that sound (2017). arXiv preprint arXiv:1712.06651"},{"key":"35_CR3","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: Soundnet: learning sound representations from unlabeled video. In: Advances in Neural Information Processing Systems, pp. 892\u2013900 (2016)","DOI":"10.1109\/CVPR.2016.18"},{"issue":"2","key":"35_CR4","doi-asserted-by":"publisher","first-page":"434","DOI":"10.1109\/78.554307","volume":"45","author":"A Belouchrani","year":"1997","unstructured":"Belouchrani, A., Abed-Meraim, K., Cardoso, J.F., Moulines, E.: A blind source separation technique using second-order statistics. IEEE Trans. Sig. Process. 45(2), 434\u2013444 (1997)","journal-title":"IEEE Trans. Sig. Process."},{"key":"35_CR5","volume-title":"Auditory Scene Analysis: The Perceptual Organization of Sound","author":"AS Bregman","year":"1994","unstructured":"Bregman, A.S.: Auditory Scene Analysis: The Perceptual Organization of Sound. MIT Press, Cambridge (1994)"},{"issue":"4","key":"35_CR6","doi-asserted-by":"publisher","first-page":"112","DOI":"10.1109\/97.566704","volume":"4","author":"JF Cardoso","year":"1997","unstructured":"Cardoso, J.F.: Infomax and maximum likelihood for blind source separation. IEEE Sig. Process. Lett. 4(4), 112\u2013114 (1997)","journal-title":"IEEE Sig. Process. Lett."},{"key":"35_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"258","DOI":"10.1007\/978-3-319-53547-0_25","volume-title":"Latent Variable Analysis and Signal Separation","author":"P Chandna","year":"2017","unstructured":"Chandna, P., Miron, M., Janer, J., G\u00f3mez, E.: Monoaural audio source separation using deep convolutional neural networks. In: Tichavsk\u00fd, P., Babaie-Zadeh, M., Michel, O.J.J., Thirion-Moreau, N. (eds.) LVA\/ICA 2017. LNCS, vol. 10169, pp. 258\u2013266. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-53547-0_25"},{"key":"35_CR8","doi-asserted-by":"publisher","DOI":"10.1002\/9780470747278","volume-title":"Nonnegative Matrix and Tensor Factorizations: Applications to Exploratory Multi-Way Data Analysis and Blind Source Separation","author":"A Cichocki","year":"2009","unstructured":"Cichocki, A., Zdunek, R., Phan, A.H., Amari, S.I.: Nonnegative Matrix and Tensor Factorizations: Applications to Exploratory Multi-Way Data Analysis and Blind Source Separation. Wiley, Chichester (2009)"},{"key":"35_CR9","volume-title":"Handbook of Blind Source Separation: Independent Component Analysis and Applications","author":"P Comon","year":"2010","unstructured":"Comon, P., Jutten, C.: Handbook of Blind Source Separation: Independent Component Analysis and Applications. Academic Press, San Diego (2010)"},{"key":"35_CR10","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1422\u20131430 (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"35_CR11","doi-asserted-by":"crossref","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation (2018). arXiv preprint arXiv:1804.03619","DOI":"10.1145\/3197517.3201357"},{"key":"35_CR12","doi-asserted-by":"crossref","unstructured":"Gabbay, A., Ephrat, A., Halperin, T., Peleg, S.: Seeing through noise: speaker separation and enhancement using visually-derived speech (2017). arXiv preprint arXiv:1708.06767","DOI":"10.1109\/ICASSP.2018.8462527"},{"key":"35_CR13","doi-asserted-by":"crossref","unstructured":"Gan, C., Gong, B., Liu, K., Su, H., Guibas, L.J.: Geometry-guided CNN for self-supervised video representation learning (2018)","DOI":"10.1109\/CVPR.2018.00586"},{"issue":"9","key":"35_CR14","doi-asserted-by":"publisher","first-page":"1875","DOI":"10.1162\/0899766054322964","volume":"17","author":"S Haykin","year":"2005","unstructured":"Haykin, S., Chen, Z.: The cocktail party problem. Neural Comput. 17(9), 1875\u20131902 (2005)","journal-title":"Neural Comput."},{"key":"35_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"35_CR16","doi-asserted-by":"crossref","unstructured":"Hershey, J.R., Chen, Z., Le Roux, J., Watanabe, S.: Deep clustering: discriminative embeddings for segmentation and separation. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 31\u201335. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"35_CR17","unstructured":"Hershey, J.R., Movellan, J.R.: Audio vision: using audio-visual synchrony to locate sounds. In: Solla, S.A., Leen, T.K., M\u00fcller, K. (eds.) Advances in Neural Information Processing Systems, vol. 12, pp. 813\u2013819. MIT Press (2000). http:\/\/papers.nips.cc\/paper\/1686-audio-vision-using-audio-visual-synchrony-to-locate-sounds.pdf"},{"key":"35_CR18","doi-asserted-by":"crossref","unstructured":"Hershey, S., Chaudhuri, S., Ellis, D.P., Gemmeke, J.F., Jansen, A., Moore, R.C., Plakal, M., Platt, D., Saurous, R.A., Seybold, B., et al.: Cnn architectures for large-scale audio classification. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 131\u2013135. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"issue":"2","key":"35_CR19","doi-asserted-by":"publisher","first-page":"378","DOI":"10.1109\/TMM.2012.2228476","volume":"15","author":"H Izadinia","year":"2013","unstructured":"Izadinia, H., Saleemi, I., Shah, M.: Multimodal analysis for identification and segmentation of moving-sounding objects. IEEE Trans. Multimed. 15(2), 378\u2013390 (2013)","journal-title":"IEEE Trans. Multimed."},{"key":"35_CR20","doi-asserted-by":"crossref","unstructured":"Jayaraman, D., Grauman, K.: Learning image representations tied to ego-motion. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1413\u20131421 (2015)","DOI":"10.1109\/ICCV.2015.166"},{"key":"35_CR21","doi-asserted-by":"publisher","unstructured":"Kidron, E., Schechner, Y.Y., Elad, M.: Pixels that sound. In: Proceedings of the 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR 2005), vol. 1, pp. 88\u201395. IEEE Computer Society, Washington (2005). https:\/\/doi.org\/10.1109\/CVPR.2005.274","DOI":"10.1109\/CVPR.2005.274"},{"key":"35_CR22","doi-asserted-by":"crossref","unstructured":"Larsson, G., Maire, M., Shakhnarovich, G.: Colorization as a proxy task for visual understanding. In: CVPR, vol. 2, p. 8 (2017)","DOI":"10.1109\/CVPR.2017.96"},{"key":"35_CR23","first-page":"1","volume":"270","author":"B Logan","year":"2000","unstructured":"Logan, B.: Mel frequency cepstral coefficients for music modeling. Int. Soc. Music Inf. Retrieval 270, 1\u201311 (2000)","journal-title":"Int. Soc. Music Inf. Retrieval"},{"key":"35_CR24","series-title":"LNCS","first-page":"211","volume-title":"ECCV 2018, Part XIV","author":"WC Ma","year":"2018","unstructured":"Ma, W.C., Chu, H., Zhou, B., Urtasun, R., Torralba, A.: Single image intrinsic decomposition without a single intrinsic image. In: Ferrari, V., et al. (eds.) ECCV 2018, Part XIV. LNCS, vol. 11205, pp. 211\u2013229. Springer, Cham (2018)"},{"issue":"22","key":"35_CR25","doi-asserted-by":"publisher","first-page":"R1024","DOI":"10.1016\/j.cub.2009.09.005","volume":"19","author":"JH McDermott","year":"2009","unstructured":"McDermott, J.H.: The cocktail party problem. Curr. Biol. 19(22), R1024\u2013R1027 (2009)","journal-title":"Curr. Biol."},{"key":"35_CR26","unstructured":"Mesaros, A., Heittola, T., Diment, A., Elizalde, B., Ankit Shah, E.A.: Dcase 2017 challenge setup: tasks, datasets and baseline system. In: DCASE 2017 - Workshop on Detection and Classification of Acoustic Scenes and Events (2017)"},{"key":"35_CR27","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Seeing voices and hearing faces: cross-modal biometric matching (2018). arXiv preprint arXiv:1804.00326","DOI":"10.1109\/CVPR.2018.00879"},{"key":"35_CR28","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: Proceedings of the 28th International Conference on International Conference on Machine Learning, ICML 2011, pp. 689\u2013696 (2011)"},{"key":"35_CR29","doi-asserted-by":"crossref","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features (2018). arXiv preprint arXiv:1804.03641","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"35_CR30","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2405\u20132413 (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"35_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1007\/978-3-319-46448-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Owens","year":"2016","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Ambient sound provides supervision for visual learning. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 801\u2013816. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_48"},{"key":"35_CR32","doi-asserted-by":"crossref","unstructured":"Pathak, D., Girshick, R., Doll\u00e1r, P., Darrell, T., Hariharan, B.: Learning features by watching objects move. In: Proceedings of CVPR, vol. 2 (2017)","DOI":"10.1109\/CVPR.2017.638"},{"key":"35_CR33","doi-asserted-by":"crossref","unstructured":"Pathak, D., Krahenbuhl, P., Donahue, J., Darrell, T., Efros, A.A.: Context encoders: feature learning by inpainting. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2536\u20132544 (2016)","DOI":"10.1109\/CVPR.2016.278"},{"key":"35_CR34","unstructured":"Raffel, C., et al.: mir\\_eval: a transparent implementation of common mir metrics. In: Proceedings of the 15th International Society for Music Information Retrieval Conference, ISMIR. Citeseer (2014)"},{"key":"35_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"35_CR36","unstructured":"de Sa, V.R.: Learning classification with unlabeled data. In: Advances in Neural Information Processing Systems, pp. 112\u2013119 (1993)"},{"key":"35_CR37","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T.H., Kim, J., Yang, M.H., Kweon, I.S.: Learning to localize sound source in visual scenes (2018). arXiv preprint arXiv:1803.03849","DOI":"10.1109\/CVPR.2018.00458"},{"key":"35_CR38","doi-asserted-by":"crossref","unstructured":"Shu, Z., Yumer, E., Hadap, S., Sunkavalli, K., Shechtman, E., Samaras, D.: Neural face editing with intrinsic image disentangling (2017). arXiv preprint arXiv:1704.04131","DOI":"10.1109\/CVPR.2017.578"},{"key":"35_CR39","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"429","DOI":"10.1007\/978-3-319-22482-4_50","volume-title":"Latent Variable Analysis and Signal Separation","author":"AJR Simpson","year":"2015","unstructured":"Simpson, A.J.R., Roma, G., Plumbley, M.D.: Deep karaoke: extracting vocals from musical mixtures using a convolutional deep neural network. In: Vincent, E., Yeredor, A., Koldovsk\u00fd, Z., Tichavsk\u00fd, P. (eds.) LVA\/ICA 2015. LNCS, vol. 9237, pp. 429\u2013436. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-22482-4_50"},{"key":"35_CR40","doi-asserted-by":"crossref","unstructured":"Smaragdis, P., Brown, J.C.: Non-negative matrix factorization for polyphonic music transcription. In: 2003 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics, pp. 177\u2013180. IEEE (2003)","DOI":"10.1109\/ASPAA.2003.1285860"},{"issue":"4","key":"35_CR41","doi-asserted-by":"publisher","first-page":"1462","DOI":"10.1109\/TSA.2005.858005","volume":"14","author":"E Vincent","year":"2006","unstructured":"Vincent, E., Gribonval, R., F\u00e9votte, C.: Performance measurement in blind audio source separation. IEEE Trans. Audio Speech Lang. Process. 14(4), 1462\u20131469 (2006)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"3","key":"35_CR42","doi-asserted-by":"publisher","first-page":"1066","DOI":"10.1109\/TASL.2006.885253","volume":"15","author":"T Virtanen","year":"2007","unstructured":"Virtanen, T.: Monaural sound source separation by nonnegative matrix factorization with temporal continuity and sparseness criteria. IEEE Trans. Audio Speech Lang. Process. 15(3), 1066\u20131074 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"35_CR43","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: Advances in Neural Information Processing Systems, pp. 613\u2013621 (2016)"},{"key":"35_CR44","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Shrivastava, A., Fathi, A., Guadarrama, S., Murphy, K.: Tracking emerges by colorizing videos (2018). arXiv preprint arXiv:1806.09594","DOI":"10.1007\/978-3-030-01261-8_24"},{"key":"35_CR45","unstructured":"Wang, D., Chen, J.: Supervised speech separation based on deep learning: an overview (2017). arXiv preprint arXiv:1708.07524"},{"key":"35_CR46","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: ICCV, pp. 2794\u20132802 (2015)","DOI":"10.1109\/ICCV.2015.320"},{"issue":"1","key":"35_CR47","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1109\/TCI.2016.2644865","volume":"3","author":"H Zhao","year":"2017","unstructured":"Zhao, H., Gallo, O., Frosio, I., Kautz, J.: Loss functions for image restoration with neural networks. IEEE Trans. Comput. Imaging 3(1), 47\u201357 (2017)","journal-title":"IEEE Trans. Comput. Imaging"},{"key":"35_CR48","doi-asserted-by":"crossref","unstructured":"Zhao, M., et al.: Through-wall human pose estimation using radio signals. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7356\u20137365 (2018)","DOI":"10.1109\/CVPR.2018.00768"},{"key":"35_CR49","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"35_CR50","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ADE20K dataset. In: Proceedings of CVPR (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"35_CR51","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, Z., Fang, C., Bui, T., Berg, T.L.: Visual to sound: Generating natural sound for videos in the wild (2017). arXiv preprint arXiv:1712.01393","DOI":"10.1109\/CVPR.2018.00374"},{"issue":"4","key":"35_CR52","doi-asserted-by":"publisher","first-page":"863","DOI":"10.1162\/089976601300014385","volume":"13","author":"M Zibulevsky","year":"2001","unstructured":"Zibulevsky, M., Pearlmutter, B.A.: Blind source separation by sparse decomposition in a signal dictionary. Neural Comput. 13(4), 863\u2013882 (2001)","journal-title":"Neural Comput."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01246-5_35","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T18:37:11Z","timestamp":1775241431000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01246-5_35"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012458","9783030012465"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01246-5_35","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}