{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,25]],"date-time":"2026-06-25T16:25:28Z","timestamp":1782404728406,"version":"3.54.5"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012458","type":"print"},{"value":"9783030012465","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01246-5_27","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T16:14:56Z","timestamp":1538756096000},"page":"451-466","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":265,"title":["Objects that Sound"],"prefix":"10.1007","author":[{"given":"Relja","family":"Arandjelovi\u0107","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Zisserman","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"27_CR1","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: SoundNet: learning sound representations from unlabeled video. In: NIPS (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"27_CR2","unstructured":"Harwath, D., Torralba, A., Glass, J.R.: Unsupervised learning of spoken language with visual context. In: NIPS (2016)"},{"key":"27_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1007\/978-3-319-46448-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Owens","year":"2016","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Ambient sound provides supervision for visual learning. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 801\u2013816. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_48"},{"key":"27_CR4","doi-asserted-by":"crossref","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Look, listen and learn. In: Proceedings of ICCV (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"27_CR5","first-page":"1107","volume":"3","author":"K Barnard","year":"2003","unstructured":"Barnard, K., Duygulu, P., de Freitas, N., Forsyth, D., Blei, D., Jordan, M.: Matching words and pictures. JMLR 3, 1107\u20131135 (2003)","journal-title":"JMLR"},{"key":"27_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1007\/3-540-47979-1_7","volume-title":"Computer Vision \u2014 ECCV 2002","author":"P Duygulu","year":"2002","unstructured":"Duygulu, P., Barnard, K., de Freitas, J.F.G., Forsyth, D.A.: Object recognition as machine translation: learning a lexicon for a fixed image vocabulary. In: Heyden, A., Sparr, G., Nielsen, M., Johansen, P. (eds.) ECCV 2002. LNCS, vol. 2353, pp. 97\u2013112. Springer, Heidelberg (2002). https:\/\/doi.org\/10.1007\/3-540-47979-1_7"},{"key":"27_CR7","unstructured":"Frome, A., et al.: Devise: a deep visual-semantic embedding model. In: NIPS (2013)"},{"key":"27_CR8","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. arXiv preprint arXiv:1502.03044 (2015)"},{"key":"27_CR9","unstructured":"de Sa, V.R.: Learning classification from unlabelled data. In: NIPS (1994)"},{"key":"27_CR10","unstructured":"Kidron, E., Schechner, Y.Y., Elad, M.: Pixels that sound. In: Proceedings of CVPR (2005)"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J.H., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: Proceedings of CVPR, pp. 2405\u20132413 (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"27_CR12","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: See, hear, and read: deep aligned representations. CoRR abs\/1706.00932 (2017)"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"Dosovitskiy, A., Springenberg, J.T., Riedmiller, M., Brox, T.: Discriminative unsupervised feature learning with convolutional neural networks. In: NIPS (2014)","DOI":"10.1109\/CVPR.2015.7298761"},{"key":"27_CR14","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: Proceedings of CVPR (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"27_CR15","doi-asserted-by":"crossref","unstructured":"Agrawal, P., Carreira, J., Malik, J.: Learning to see by moving. In: Proceedings of ICCV (2015)","DOI":"10.1109\/ICCV.2015.13"},{"key":"27_CR16","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: Proceedings of ICCV, pp. 2794\u20132802 (2015)","DOI":"10.1109\/ICCV.2015.320"},{"key":"27_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Colorful image colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 649\u2013666. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_40"},{"key":"27_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/978-3-319-46448-0_32","volume-title":"Computer Vision \u2013 ECCV 2016","author":"I Misra","year":"2016","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Shuffle and learn: unsupervised learning using temporal order verification. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 527\u2013544. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_32"},{"key":"27_CR19","doi-asserted-by":"crossref","unstructured":"Pathak, D., Kr\u00e4henb\u00fchl, P., Donahue, J., Darrell, T., Efros, A.A.: Context encoders: feature learning by inpainting. In: Proceedings of CVPR, pp. 2536\u20132544 (2016)","DOI":"10.1109\/CVPR.2016.278"},{"key":"27_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46466-4_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Noroozi","year":"2016","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving jigsaw puzzles. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 69\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_5"},{"key":"27_CR21","doi-asserted-by":"crossref","unstructured":"Fernando, B., Bilen, H., Gavves, E., Gould, S.: Self-supervised video representation learning with odd-one-out networks. In: Proceedings of ICCV (2017)","DOI":"10.1109\/CVPR.2017.607"},{"key":"27_CR22","doi-asserted-by":"crossref","unstructured":"Doersch, C., Zisserman, A.: Multi-task self-supervised visual learning. In: Proceedings of ICCV (2017)","DOI":"10.1109\/ICCV.2017.226"},{"key":"27_CR23","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: ICASSP (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"27_CR24","unstructured":"Arandjelovi\u0107, R., Zisserman, A.: Objects that sound. CoRR abs\/1712.06651 (2017)"},{"key":"27_CR25","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: Proceedings of ICML (2015)"},{"key":"27_CR26","doi-asserted-by":"crossref","unstructured":"Arandjelovi\u0107, R., Gronat, P., Torii, A., Pajdla, T., Sivic, J.: NetVLAD: CNN architecture for weakly supervised place recognition. In: IEEE PAMI (2017)","DOI":"10.1109\/CVPR.2016.572"},{"key":"27_CR27","doi-asserted-by":"crossref","unstructured":"Chopra, S., Hadsell, R., LeCun, Y.: Learning a similarity metric discriminatively, with application to face verification. In: Proceedings of CVPR, vol. 1, pp. 539\u2013546. IEEE (2005)","DOI":"10.1109\/CVPR.2005.202"},{"key":"27_CR28","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: Proceedings of CVPR (2016)","DOI":"10.1109\/CVPR.2016.541"},{"key":"27_CR29","doi-asserted-by":"crossref","unstructured":"Hong, S., Im, W., S. Yang, H.: CBVMR: content-based video-music retrieval using soft intra-modal structure constraint. In: ACM ICMR (2018)","DOI":"10.1145\/3206025.3206046"},{"key":"27_CR30","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: International Conference on Learning Representations (2015)"},{"key":"27_CR31","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NIPS (2014)"},{"key":"27_CR32","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: Proceedings of ICLR (2015)"},{"key":"27_CR33","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. In: Proceedings of CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"27_CR34","doi-asserted-by":"crossref","unstructured":"Piczak, K.J.: ESC: dataset for environmental sound classification. In: Proceedings of ACMM (2015)","DOI":"10.1145\/2733373.2806390"},{"issue":"1\u20132","key":"27_CR35","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1016\/S0004-3702(96)00034-3","volume":"89","author":"TG Dietterich","year":"1997","unstructured":"Dietterich, T.G., Lathrop, R.H., Lozano-Perez, T.: Solving the multiple instance problem with axis-parallel rectangles. Artif. Intell. 89(1\u20132), 31\u201371 (1997)","journal-title":"Artif. Intell."},{"key":"27_CR36","doi-asserted-by":"crossref","unstructured":"Oquab, M., Bottou, L., Laptev, I., Sivic, J.: Is object localization for free? - Weakly-supervised learning with convolutional neural networks. In: Proceedings of CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298668"},{"key":"27_CR37","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of CVPR (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"27_CR38","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: Proceedings of ICLR (2015)"},{"issue":"5","key":"27_CR39","doi-asserted-by":"publisher","first-page":"882","DOI":"10.1109\/JSTSP.2010.2057890","volume":"4","author":"ST Shivappa","year":"2010","unstructured":"Shivappa, S.T., Rao, B.D., Trivedi, M.M.: Audio-visual fusion and tracking with multilevel iterative decoding: framework and experimental evaluation. IEEE J. Sel. Top. Signal Process. 4(5), 882\u2013894 (2010)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T.H., Kim, J., Yang, M.H., Kweon, I.S.: On learning association of sound source and visual scenes. In: Proceedings of CVPR (2018)","DOI":"10.1109\/CVPR.2018.00458"},{"key":"27_CR41","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. In: Ferrari, (eds.) ECCV 2018, Part I. LNCS, vol. 11205, pp. 587\u2013604. Springer, Cham (2018)","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"27_CR42","doi-asserted-by":"crossref","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Proceedings of ECCV (2018, to appear)","DOI":"10.1007\/978-3-030-01231-1_39"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01246-5_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T18:37:03Z","timestamp":1775241423000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01246-5_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012458","9783030012465"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01246-5_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}