{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,7]],"date-time":"2026-02-07T08:55:05Z","timestamp":1770454505327,"version":"3.49.0"},"publisher-location":"Cham","reference-count":86,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012182","type":"print"},{"value":"9783030012199","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01219-9_3","type":"book-chapter","created":{"date-parts":[[2018,10,6]],"date-time":"2018-10-06T14:23:51Z","timestamp":1538835831000},"page":"36-54","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":144,"title":["Learning to Separate Object Sounds by Watching Unlabeled Video"],"prefix":"10.1007","author":[{"given":"Ruohan","family":"Gao","sequence":"first","affiliation":[]},{"given":"Rogerio","family":"Feris","sequence":"additional","affiliation":[]},{"given":"Kristen","family":"Grauman","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,7]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Afouras, T., Chung, J.S., Zisserman, A.: The conversation: deep audio-visual speech enhancement. 
arXiv preprint arXiv:1804.04121 (2018)","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"3_CR2","doi-asserted-by":"publisher","first-page":"288","DOI":"10.1109\/TPAMI.2008.284","volume":"32","author":"S Ali","year":"2010","unstructured":"Ali, S., Shah, M.: Human action recognition in videos using kinematic features and multiple instance learning. PAMI 32, 288\u2013303 (2010)","journal-title":"PAMI"},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen and learn. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"3_CR4","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Objects that sound. arXiv preprint arXiv:1712.06651 (2017)"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: SoundNet: learning sound representations from unlabeled video. In: NIPS (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"3_CR6","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: See, hear, and read: deep aligned representations. arXiv preprint arXiv:1706.00932 (2017)"},{"key":"3_CR7","first-page":"1107","volume":"3","author":"K Barnard","year":"2003","unstructured":"Barnard, K., Duygulu, P., de Freitas, N., Blei, D., Jordan, M.: Matching words and pictures. JMLR 3, 1107\u20131135 (2003)","journal-title":"JMLR"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Barzelay, Z., Schechner, Y.Y.: Harmony in motion. In: CVPR (2007)","DOI":"10.1109\/CVPR.2007.383344"},{"key":"3_CR9","unstructured":"Berg, T., et al.: Names and faces in the news. In: CVPR (2004)"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Bilen, H., Vedaldi, A.: Weakly supervised deep detection networks. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.311"},{"key":"3_CR11","unstructured":"Bryan, N.: Interactive Sound Source Separation. Ph.D. 
thesis, Stanford University (2014)"},{"key":"3_CR12","doi-asserted-by":"publisher","first-page":"358","DOI":"10.1109\/TMM.2010.2050650","volume":"12","author":"AL Casanovas","year":"2010","unstructured":"Casanovas, A.L., Monaci, G., Vandergheynst, P., Gribonval, R.: Blind audiovisual source separation based on sparse redundant representations. IEEE Trans. Multimed. 12, 358\u2013371 (2010)","journal-title":"IEEE Trans. Multimed."},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"Chen, L., Srivastava, S., Duan, Z., Xu, C.: Deep cross-modal audio-visual generation. In: Proceedings of the on Thematic Workshops of ACM Multimedia (2017)","DOI":"10.1145\/3126686.3126723"},{"key":"3_CR14","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1109\/TPAMI.2016.2535231","volume":"39","author":"R Cinbis","year":"2017","unstructured":"Cinbis, R., Verbeek, J., Schmid, C.: Weakly supervised object localization with multi-fold multiple instance learning. PAMI 39, 189\u2013203 (2017)","journal-title":"PAMI"},{"key":"3_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/3-540-40063-X_5","volume-title":"Advances in Multimodal Interfaces \u2014 ICMI 2000","author":"T Darrell","year":"2000","unstructured":"Darrell, T., Fisher, J.W., Viola, P.: Audio-visual segmentation and the cocktail party effect. In: Tan, T., Shi, Y., Gao, W. (eds.) ICMI 2000. LNCS, vol. 1948, pp. 32\u201340. Springer, Heidelberg (2000). https:\/\/doi.org\/10.1007\/3-540-40063-X_5"},{"key":"3_CR16","doi-asserted-by":"publisher","first-page":"275","DOI":"10.1007\/s11263-012-0538-3","volume":"100","author":"T Deselaers","year":"2012","unstructured":"Deselaers, T., Alexe, B., Ferrari, V.: Weakly supervised localization and learning with generic knowledge. 
IJCV 100, 275\u2013293 (2012)","journal-title":"IJCV"},{"key":"3_CR17","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1016\/S0004-3702(96)00034-3","volume":"89","author":"TG Dietterich","year":"1997","unstructured":"Dietterich, T.G., Lathrop, R.H., Lozano-P\u00e9rez, T.: Solving the multiple instance problem with axis-parallel rectangles. Artif. intell. 89, 31\u201371 (1997)","journal-title":"Artif. intell."},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Donahue, J., et al.: Long-term recurrent convolutional networks for visual recognition and description. In: CVPR (2015)","DOI":"10.21236\/ADA623249"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Duong, N.Q., Ozerov, A., Chevallier, L., Sirot, J.: An interactive audio source separation framework based on non-negative matrix factorization. In: ICASSP (2014)","DOI":"10.1109\/ICASSP.2014.6853861"},{"key":"3_CR20","doi-asserted-by":"publisher","first-page":"1830","DOI":"10.1109\/TASL.2010.2050716","volume":"18","author":"NQ Duong","year":"2010","unstructured":"Duong, N.Q., Vincent, E., Gribonval, R.: Under-determined reverberant audio source separation using a full-rank spatial covariance model. IEEE Trans. Audio Speech Lang. Process. 18, 1830\u20131840 (2010)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"3_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1007\/3-540-47979-1_7","volume-title":"Computer Vision \u2014 ECCV 2002","author":"P Duygulu","year":"2002","unstructured":"Duygulu, P., Barnard, K., de Freitas, J.F.G., Forsyth, D.A.: Object recognition as machine translation: learning a lexicon for a fixed image vocabulary. In: Heyden, A., Sparr, G., Nielsen, M., Johansen, P. (eds.) ECCV 2002. LNCS, vol. 2353, pp. 97\u2013112. Springer, Heidelberg (2002). 
https:\/\/doi.org\/10.1007\/3-540-47979-1_7"},{"key":"3_CR22","unstructured":"Ellis, D.P.W.: Prediction-driven computational auditory scene analysis. Ph.D. thesis, Massachusetts Institute of Technology (1996)"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:1804.03619 (2018)","DOI":"10.1145\/3197517.3201357"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Feng, J., Zhou, Z.H.: Deep MIML network. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.10890"},{"key":"3_CR25","doi-asserted-by":"publisher","first-page":"793","DOI":"10.1162\/neco.2008.04-08-771","volume":"21","author":"C F\u00e9votte","year":"2009","unstructured":"F\u00e9votte, C., Bertin, N., Durrieu, J.L.: Nonnegative matrix factorization with the itakura-saito divergence: with application to music analysis. Neural comput. 21, 793\u2013830 (2009)","journal-title":"Neural comput."},{"key":"3_CR26","doi-asserted-by":"publisher","first-page":"2421","DOI":"10.1162\/NECO_a_00168","volume":"23","author":"C F\u00e9votte","year":"2011","unstructured":"F\u00e9votte, C., Idier, J.: Algorithms for nonnegative matrix factorization with the $$\\beta $$-divergence. Neural comput. 23, 2421\u20132456 (2011)","journal-title":"Neural comput."},{"key":"3_CR27","unstructured":"Fisher III, J.W., Darrell, T., Freeman, W.T., Viola, P.A.: Learning joint statistical models for audio-visual fusion and segregation. In: NIPS (2001)"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Gabbay, A., Shamir, A., Peleg, S.: Visual speech enhancement using noise-invariant training. arXiv preprint arXiv:1711.08789 (2017)","DOI":"10.21437\/Interspeech.2018-1955"},{"key":"3_CR29","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. 
In: ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"3_CR30","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1109\/TASSP.1984.1164317","volume":"32","author":"D Griffin","year":"1984","unstructured":"Griffin, D., Lim, J.: Signal estimation from modified short-time fourier transform. IEEE Trans. Acoust. Speech Signal Process. 32, 236\u2013243 (1984)","journal-title":"IEEE Trans. Acoust. Speech Signal Process."},{"key":"3_CR31","doi-asserted-by":"crossref","unstructured":"Guo, X., Uhlich, S., Mitsufuji, Y.: NMF-based blind source separation using a linear predictive coding error clustering criterion. In: ICASSP (2015)","DOI":"10.1109\/ICASSP.2015.7177972"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Harwath, D., Glass, J.: Learning word-like units from joint audio-visual analysis. In: ACL (2017)","DOI":"10.18653\/v1\/P17-1047"},{"key":"3_CR33","doi-asserted-by":"crossref","unstructured":"Harwath, D., Recasens, A., Sur\u00eds, D., Chuang, G., Torralba, A., Glass, J.: Jointly discovering visual objects and spoken words from raw sensory input. arXiv preprint arXiv:1804.01452 (2018)","DOI":"10.1007\/978-3-030-01231-1_40"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3_CR35","doi-asserted-by":"crossref","unstructured":"Hennequin, R., David, B., Badeau, R.: Score informed audio source separation using a parametric model of non-negative spectrogram. In: ICASSP (2011)","DOI":"10.1109\/ICASSP.2011.5946324"},{"key":"3_CR36","doi-asserted-by":"crossref","unstructured":"Hershey, J.R., Chen, Z., Le Roux, J., Watanabe, S.: Deep clustering: discriminative embeddings for segmentation and separation. In: ICASSP (2016)","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"3_CR37","unstructured":"Hershey, J.R., Movellan, J.R.: Audio vision: using audio-visual synchrony to locate sounds. 
In: NIPS (2000)"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Hofmann, T.: Probabilistic latent semantic indexing. In: International ACM SIGIR Conference on Research and Development in Information Retrieval (1999)","DOI":"10.1145\/312624.312649"},{"key":"3_CR39","doi-asserted-by":"crossref","unstructured":"Huang, P.S., Kim, M., Hasegawa-Johnson, M., Smaragdis, P.: Deep learning for monaural speech separation. In: ICASSP (2014)","DOI":"10.1109\/ICASSP.2014.6853860"},{"key":"3_CR40","doi-asserted-by":"publisher","first-page":"411","DOI":"10.1016\/S0893-6080(00)00026-5","volume":"13","author":"A Hyv\u00e4rinen","year":"2000","unstructured":"Hyv\u00e4rinen, A., Oja, E.: Independent component analysis: algorithms and applications. Neural Netw. 13, 411\u2013430 (2000)","journal-title":"Neural Netw."},{"key":"3_CR41","doi-asserted-by":"publisher","first-page":"1333","DOI":"10.1016\/j.camwa.2012.03.077","volume":"64","author":"S Innami","year":"2012","unstructured":"Innami, S., Kasai, H.: NMF-based environmental sound source separation using time-variant gain features. Comput. Math. Appl. 64, 1333\u20131342 (2012)","journal-title":"Comput. Math. Appl."},{"key":"3_CR42","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: ICML (2015)"},{"key":"3_CR43","doi-asserted-by":"publisher","first-page":"378","DOI":"10.1109\/TMM.2012.2228476","volume":"15","author":"H Izadinia","year":"2013","unstructured":"Izadinia, H., Saleemi, I., Shah, M.: Multimodal analysis for identification and segmentation of moving-sounding objects. IEEE Trans. Multimed. 15, 378\u2013390 (2013)","journal-title":"IEEE Trans. Multimed."},{"key":"3_CR44","doi-asserted-by":"crossref","unstructured":"Jaiswal, R., FitzGerald, D., Barry, D., Coyle, E., Rickard, S.: Clustering NMF basis functions using shifted NMF for monaural sound source separation. 
In: ICASSP (2011)","DOI":"10.1109\/ICASSP.2011.5946386"},{"key":"3_CR45","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s00138-013-0567-0","volume":"25","author":"IH Jhuo","year":"2014","unstructured":"Jhuo, I.H., Ye, G., Gao, S., Liu, D., Jiang, Y.G., Lee, D., Chang, S.F.: Discovering joint audio-visual codewords for video event detection. Machine Vis. Appl. 25, 33\u201347 (2014)","journal-title":"Machine Vis. Appl."},{"key":"3_CR46","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"3_CR47","unstructured":"Kidron, E., Schechner, Y.Y., Elad, M.: Pixels that sound. In: CVPR (2005)"},{"key":"3_CR48","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"3_CR49","unstructured":"Korbar, B., Tran, D., Torresani, L.: Co-training of audio and video representations from self-supervised temporal synchronization. arXiv preprint arXiv:1807.00230 (2018)"},{"key":"3_CR50","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1007\/s11265-014-0920-1","volume":"79","author":"L Le Magoarou","year":"2015","unstructured":"Le Magoarou, L., Ozerov, A., Duong, N.Q.: Text-informed audio source separation. Example-based approach using non-negative matrix partial co-factorization. J. Signal Process. Syst. 79, 117\u2013131 (2015)","journal-title":"J. Signal Process. Syst."},{"key":"3_CR51","unstructured":"Lee, D.D., Seung, H.S.: Algorithms for non-negative matrix factorization. In: Advances in Neural Information Processing Systems (2001)"},{"key":"3_CR52","doi-asserted-by":"crossref","unstructured":"Li, B., Dinesh, K., Duan, Z., Sharma, G.: See and listen: score-informed association of sound tracks to players in chamber music performance videos. 
In: ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7952688"},{"key":"3_CR53","doi-asserted-by":"crossref","unstructured":"Li, K., Ye, J., Hua, K.A.: What\u2019s making that sound? In: ACMMM (2014)","DOI":"10.1145\/2647868.2654936"},{"key":"3_CR54","doi-asserted-by":"publisher","first-page":"4298","DOI":"10.1109\/TSP.2014.2332434","volume":"62","author":"A Liutkus","year":"2014","unstructured":"Liutkus, A., Fitzgerald, D., Rafii, Z., Pardo, B., Daudet, L.: Kernel additive models for source separation. IEEE Trans. Signal Process. 62, 4298\u20134310 (2014)","journal-title":"IEEE Trans. Signal Process."},{"issue":"1","key":"3_CR55","doi-asserted-by":"publisher","first-page":"523","DOI":"10.1214\/12-AOAS597","volume":"7","author":"EF Lock","year":"2013","unstructured":"Lock, E.F., Hoadley, K.A., Marron, J.S., Nobel, A.B.: Joint and individual variation explained (JIVE) for integrated analysis of multiple data types. Ann. Appl. Stat. 7(1), 523 (2013)","journal-title":"Ann. Appl. Stat."},{"key":"3_CR56","unstructured":"Nakadai, K., Hidai, K.I., Okuno, H.G., Kitano, H.: Real-time speaker localization and speech separation by audio-visual integration. In: IEEE International Conference on Robotics and Automation (2002)"},{"key":"3_CR57","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1109\/MMUL.2006.63","volume":"13","author":"M Naphade","year":"2006","unstructured":"Naphade, M., Smith, J.R., Tesic, J., Chang, S.F., Hsu, W., Kennedy, L., Hauptmann, A., Curtis, J.: Large-scale concept ontology for multimedia. IEEE Multimed. 13, 86\u201391 (2006)","journal-title":"IEEE Multimed."},{"key":"3_CR58","doi-asserted-by":"crossref","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. 
arXiv preprint arXiv:1804.03641 (2018)","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"3_CR59","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"3_CR60","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1007\/978-3-319-46448-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Owens","year":"2016","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Ambient sound provides supervision for visual learning. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 801\u2013816. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_48"},{"key":"3_CR61","doi-asserted-by":"crossref","unstructured":"Parekh, S., Essid, S., Ozerov, A., Duong, N.Q., P\u00e9rez, P., Richard, G.: Motion informed audio source separation. In: ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7951787"},{"key":"3_CR62","doi-asserted-by":"crossref","unstructured":"Pu, J., Panagakis, Y., Petridis, S., Pantic, M.: Audio-visual object localization and separation using low-rank and sparsity. In: ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7952687"},{"key":"3_CR63","doi-asserted-by":"publisher","first-page":"127","DOI":"10.1016\/j.brainres.2007.01.074","volume":"1144","author":"T Rahne","year":"2007","unstructured":"Rahne, T., B\u00f6ckmann, M., von Specht, H., Sussman, E.S.: Visual cues can modulate integration and segregation of objects in auditory scene analysis. Brain Res. 
1144, 127\u2013135 (2007)","journal-title":"Brain Res."},{"key":"3_CR64","doi-asserted-by":"publisher","first-page":"96","DOI":"10.1109\/TASL.2006.872619","volume":"15","author":"B Rivet","year":"2007","unstructured":"Rivet, B., Girin, L., Jutten, C.: Mixing audiovisual speech processing and blind source separation for the extraction of speech signals from convolutive mixtures. IEEE Trans. Audio Speech Lang. Process. 15, 96\u2013108 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"3_CR65","doi-asserted-by":"crossref","unstructured":"Sedighin, F., Babaie-Zadeh, M., Rivet, B., Jutten, C.: Two multimodal approaches for single microphone source separation. In: 24th European Signal Processing Conference (2016)","DOI":"10.1109\/EUSIPCO.2016.7760220"},{"key":"3_CR66","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"429","DOI":"10.1007\/978-3-319-22482-4_50","volume-title":"Latent Variable Analysis and Signal Separation","author":"AJR Simpson","year":"2015","unstructured":"Simpson, A.J.R., Roma, G., Plumbley, M.D.: Deep karaoke: extracting vocals from musical mixtures using a convolutional deep neural network. In: Vincent, E., Yeredor, A., Koldovsk\u00fd, Z., Tichavsk\u00fd, P. (eds.) LVA\/ICA 2015. LNCS, vol. 9237, pp. 429\u2013436. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-22482-4_50"},{"key":"3_CR67","unstructured":"Smaragdis, P., Casey, M.: Audio\/visual independent components. In: International Conference on Independent Component Analysis and Signal Separation (2003)"},{"key":"3_CR68","unstructured":"Smaragdis, P., Raj, B., Shashanka, M.: A probabilistic latent variable model for acoustic modeling. 
In: NIPS (2006)"},{"key":"3_CR69","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"414","DOI":"10.1007\/978-3-540-74494-8_52","volume-title":"Independent Component Analysis and Signal Separation","author":"P Smaragdis","year":"2007","unstructured":"Smaragdis, P., Raj, B., Shashanka, M.: Supervised and semi-supervised separation of sounds from single-channel mixtures. In: Davies, M.E., James, C.J., Abdallah, S.A., Plumbley, M.D. (eds.) ICA 2007. LNCS, vol. 4666, pp. 414\u2013421. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-74494-8_52"},{"key":"3_CR70","doi-asserted-by":"crossref","unstructured":"Smeaton, A.F., Over, P., Kraaij, W.: Evaluation campaigns and TRECVid. In: Proceedings of the 8th ACM International Workshop on Multimedia Information Retrieval (2006)","DOI":"10.1145\/1178677.1178722"},{"key":"3_CR71","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1023\/B:MTAP.0000046380.27575.a5","volume":"25","author":"CG Snoek","year":"2005","unstructured":"Snoek, C.G., Worring, M.: Multimodal video indexing: a review of the state-of-the-art. Multimed. Tools Appl. 25, 5\u201335 (2005)","journal-title":"Multimed. Tools Appl."},{"key":"3_CR72","unstructured":"Spiertz, M.: Source-filter based clustering for monaural blind source separation. In: 12th International Conference on Digital Audio Effects (2009)"},{"key":"3_CR73","doi-asserted-by":"crossref","unstructured":"Vijayanarasimhan, S., Grauman, K.: Keywords to visual categories: multiple-instance learning for weakly supervised object categorization. In: CVPR (2008)","DOI":"10.1109\/CVPR.2008.4587632"},{"key":"3_CR74","doi-asserted-by":"publisher","first-page":"1462","DOI":"10.1109\/TSA.2005.858005","volume":"14","author":"E Vincent","year":"2006","unstructured":"Vincent, E., Gribonval, R., F\u00e9votte, C.: Performance measurement in blind audio source separation. IEEE Trans. Audio Speech Lang. Process. 
14, 1462\u20131469 (2006)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"3_CR75","unstructured":"Virtanen, T.: Sound source separation using sparse coding with temporal continuity objective. In: International Computer Music Conference (2003)"},{"key":"3_CR76","doi-asserted-by":"publisher","first-page":"1066","DOI":"10.1109\/TASL.2006.885253","volume":"15","author":"T Virtanen","year":"2007","unstructured":"Virtanen, T.: Monaural sound source separation by nonnegative matrix factorization with temporal continuity and sparseness criteria. IEEE Trans. Audio Speech Lang. Process. 15, 1066\u20131074 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"3_CR77","unstructured":"Wang, B.: Investigating single-channel audio source separation methods based on non-negative matrix factorization. In: ICA Research Network International Workshop (2006)"},{"key":"3_CR78","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Lin, D., Gool, L.V.: Untrimmednets for weakly supervised action recognition and detection. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.678"},{"key":"3_CR79","unstructured":"Wang, Z., et al.: Truly multi-modal YouTube-8m video classification with video, audio, and text. arXiv preprint arXiv:1706.05461 (2017)"},{"key":"3_CR80","doi-asserted-by":"crossref","unstructured":"Wu, J., Yu, Y., Huang, C., Yu, K.: Deep multiple instance learning for image classification and auto-annotation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298968"},{"key":"3_CR81","doi-asserted-by":"crossref","unstructured":"Yang, H., Zhou, J.T., Cai, J., Ong, Y.S.: MIML-FCN+: multi-instance multi-label learning via fully convolutional networks with privileged information. 
In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.635"},{"key":"3_CR82","doi-asserted-by":"publisher","first-page":"1830","DOI":"10.1109\/TSP.2004.828896","volume":"52","author":"O Yilmaz","year":"2004","unstructured":"Yilmaz, O., Rickard, S.: Blind separation of speech mixtures via time-frequency masking. IEEE Trans. Signal Process. 52, 1830\u20131847 (2004)","journal-title":"IEEE Trans. Signal Process."},{"key":"3_CR83","doi-asserted-by":"crossref","unstructured":"Zhang, Z., et al.: Generative modeling of audible shapes for object perception. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.141"},{"key":"3_CR84","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. arXiv preprint arXiv:1804.03160 (2018)","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"3_CR85","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, Z., Fang, C., Bui, T., Berg, T.L.: Visual to sound: generating natural sound for videos in the wild. arXiv preprint arXiv:1712.01393 (2017)","DOI":"10.1109\/CVPR.2018.00374"},{"key":"3_CR86","doi-asserted-by":"publisher","first-page":"863","DOI":"10.1162\/089976601300014385","volume":"13","author":"M Zibulevsky","year":"2001","unstructured":"Zibulevsky, M., Pearlmutter, B.A.: Blind source separation by sparse decomposition in a signal dictionary. Neural Computat. 
13, 863\u2013882 (2001)","journal-title":"Neural Computat."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01219-9_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,6]],"date-time":"2022-10-06T00:58:26Z","timestamp":1665017906000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01219-9_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012182","9783030012199"],"references-count":86,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01219-9_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"7 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}