{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:47:12Z","timestamp":1777654032471,"version":"3.51.4"},"publisher-location":"Cham","reference-count":59,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012304","type":"print"},{"value":"9783030012311","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01231-1_40","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T16:03:25Z","timestamp":1538755405000},"page":"659-677","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":79,"title":["Jointly Discovering Visual Objects and Spoken Words from Raw Sensory Input"],"prefix":"10.1007","author":[{"given":"David","family":"Harwath","sequence":"first","affiliation":[]},{"given":"Adri\u00e0","family":"Recasens","sequence":"additional","affiliation":[]},{"given":"D\u00eddac","family":"Sur\u00eds","sequence":"additional","affiliation":[]},{"given":"Galen","family":"Chuang","sequence":"additional","affiliation":[]},{"given":"Antonio","family":"Torralba","sequence":"additional","affiliation":[]},{"given":"James","family":"Glass","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"40_CR1","doi-asserted-by":"crossref","unstructured":"Alishahi, A., Barking, M., Chrupala, G.: Encoding of 
phonology in a recurrent neural model of grounded speech. In: CoNLL (2017)","DOI":"10.18653\/v1\/K17-1037"},{"key":"40_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"40_CR3","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen, and learn. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"40_CR4","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: SoundNet: learning sound representations from unlabeled video. In: Advances in Neural Information Processing Systems, vol. 29, pp. 892\u2013900 (2016)","DOI":"10.1109\/CVPR.2016.18"},{"key":"40_CR5","unstructured":"Bergamo, A., Bazzani, L., Anguelov, D., Torresani, L.: Self-taught object localization with deep networks. CoRR abs\/1409.3964 (2014). http:\/\/arxiv.org\/abs\/1409.3964"},{"key":"40_CR6","doi-asserted-by":"crossref","unstructured":"Bromley, J., Guyon, I., LeCun, Y., S\u00e4ckinger, E., Shah, R.: Signature verification using a \u201csiamese\u201d time delay neural network. In: Cowan, J.D., Tesauro, G., Alspector, J. (eds.) Advances in Neural Information Processing Systems, vol. 6, pp. 737\u2013744. Morgan-Kaufmann (1994)","DOI":"10.1142\/9789812797926_0003"},{"key":"40_CR7","doi-asserted-by":"crossref","unstructured":"Cho, M., Kwak, S., Schmid, C., Ponce, J.: Unsupervised object discovery and localization in the wild: part-based matching with bottom-up region proposals. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298724"},{"key":"40_CR8","doi-asserted-by":"crossref","unstructured":"Chrupala, G., Gelderloos, L., Alishahi, A.: Representations of language in a model of visually grounded speech signal. 
In: ACL (2017)","DOI":"10.18653\/v1\/P17-1057"},{"issue":"1","key":"40_CR9","doi-asserted-by":"publisher","first-page":"189","DOI":"10.1109\/TPAMI.2016.2535231","volume":"39","author":"R Cinbis","year":"2016","unstructured":"Cinbis, R., Verbeek, J., Schmid, C.: Weakly supervised object localization with multi-fold multiple instance learning. IEEE Trans. Pattern Anal. Mach. Intell. (PAMI) 39(1), 189\u2013203 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell. (PAMI)"},{"key":"40_CR10","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. CoRR abs\/1505.05192 (2015). http:\/\/arxiv.org\/abs\/1505.05192","DOI":"10.1109\/ICCV.2015.167"},{"key":"40_CR11","doi-asserted-by":"crossref","unstructured":"Drexler, J., Glass, J.: Analysis of audio-visual features for unsupervised speech recognition. In: Grounded Language Understanding Workshop (2017)","DOI":"10.21437\/GLU.2017-12"},{"key":"40_CR12","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1016\/j.cognition.2017.11.008","volume":"173","author":"E Dupoux","year":"2018","unstructured":"Dupoux, E.: Cognitive science in the era of artificial intelligence: a roadmap for reverse-engineering the infant language-learner. Cognition 173, 43\u201359 (2018)","journal-title":"Cognition"},{"key":"40_CR13","doi-asserted-by":"crossref","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"40_CR14","unstructured":"Gao, H., Mao, J., Zhou, J., Huang, Z., Yuille, A.: Are you talking to a machine? Dataset and methods for multilingual image question answering. In: NIPS (2015)"},{"key":"40_CR15","unstructured":"Gelderloos, L., Chrupa\u0142a, G.: From phonemes to images: levels of representation in a recurrent neural model of visually-grounded language learning. 
arXiv:1610.03342 (2016)"},{"key":"40_CR16","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2013)","DOI":"10.1109\/CVPR.2014.81"},{"key":"40_CR17","doi-asserted-by":"crossref","unstructured":"Gu\u00e9rin, J., Gibaru, O., Thiery, S., Nyiri, E.: CNN features are also great at unsupervised classification. CoRR abs\/1707.01700 (2017). http:\/\/arxiv.org\/abs\/1707.01700","DOI":"10.5121\/csit.2018.80308"},{"key":"40_CR18","doi-asserted-by":"crossref","unstructured":"Harwath, D., Glass, J.: Learning word-like units from joint audio-visual analysis. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL) (2017)","DOI":"10.18653\/v1\/P17-1047"},{"key":"40_CR19","unstructured":"Harwath, D., Torralba, A., Glass, J.R.: Unsupervised learning of spoken language with visual context. In: Proceedings of the Neural Information Processing Systems (NIPS) (2016)"},{"key":"40_CR20","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. J. Mach. Learn. Res. (JMLR) (2015)"},{"key":"40_CR21","doi-asserted-by":"crossref","unstructured":"Jansen, A., Church, K., Hermansky, H.: Toward spoken term discovery at scale with zero resources. In: Proceedings of the Annual Conference of International Speech Communication Association (INTERSPEECH) (2010)","DOI":"10.21437\/Interspeech.2010-483"},{"key":"40_CR22","doi-asserted-by":"crossref","unstructured":"Jansen, A., Van Durme, B.: Efficient spoken term discovery using randomized algorithms. 
In: Proceedings of the IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU) (2011)","DOI":"10.1109\/ASRU.2011.6163965"},{"key":"40_CR23","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: DenseCap: fully convolutional localization networks for dense captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"40_CR24","doi-asserted-by":"crossref","unstructured":"Kamper, H., Elsner, M., Jansen, A., Goldwater, S.: Unsupervised neural network based feature extraction using weak top-down constraints. In: Proceedings of the International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2015)","DOI":"10.1109\/ICASSP.2015.7179087"},{"issue":"4","key":"40_CR25","doi-asserted-by":"publisher","first-page":"669","DOI":"10.1109\/TASLP.2016.2517567","volume":"24","author":"H Kamper","year":"2016","unstructured":"Kamper, H., Jansen, A., Goldwater, S.: Unsupervised word segmentation and lexicon discovery using acoustic word embeddings. IEEE Trans. Audio Speech Lang. Process. 24(4), 669\u2013679 (2016)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"40_CR26","doi-asserted-by":"crossref","unstructured":"Kamper, H., Settle, S., Shakhnarovich, G., Livescu, K.: Visually grounded learning of keyword prediction from untranscribed speech. In: INTERSPEECH (2017)","DOI":"10.21437\/Interspeech.2017-502"},{"key":"40_CR27","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.: Deep fragment embeddings for bidirectional image sentence mapping. In: Proceedings of the Neural Information Processing Systems (NIPS) (2014)"},{"key":"40_CR28","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Li, F.F.: Deep visual-semantic alignments for generating image descriptions. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"issue":"11","key":"40_CR29","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"key":"40_CR30","unstructured":"Lee, C., Glass, J.: A nonparametric Bayesian approach to acoustic model discovery. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL) (2012)"},{"key":"40_CR31","unstructured":"Lewis, M.P., Simon, G.F., Fennig, C.D.: Ethnologue: Languages of the World, 19th edn. SIL International (2016). http:\/\/www.ethnologue.com"},{"key":"40_CR32","doi-asserted-by":"crossref","unstructured":"Lin, T., et al.: Microsoft COCO: common objects in context. arXiv:1405.0312 (2015)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"40_CR33","unstructured":"Malinowski, M., Fritz, M.: A multi-world approach to question answering about real-world scenes based on uncertain input. In: NIPS (2014)"},{"key":"40_CR34","doi-asserted-by":"crossref","unstructured":"Malinowski, M., Rohrbach, M., Fritz, M.: Ask your neurons: a neural-based approach to answering questions about images. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.9"},{"key":"40_CR35","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1016\/j.procs.2016.04.033","volume":"81","author":"Lucas Ondel","year":"2016","unstructured":"Ondel, L., Burget, L., Cernocky, J.: Variational inference for acoustic unit discovery. 
In: 5th Workshop on Spoken Language Technology for Under-Resourced Language (2016)","journal-title":"Procedia Computer Science"},{"key":"40_CR36","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J.H., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, 27\u201330 June 2016, pp. 2405\u20132413 (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"40_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1007\/978-3-319-46448-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Owens","year":"2016","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Ambient sound provides supervision for visual learning. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 801\u2013816. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_48"},{"issue":"1","key":"40_CR38","doi-asserted-by":"publisher","first-page":"186","DOI":"10.1109\/TASL.2007.909282","volume":"16","author":"A Park","year":"2008","unstructured":"Park, A., Glass, J.: Unsupervised pattern discovery in speech. IEEE Trans. Audio Speech Lang. Process. 16(1), 186\u2013197 (2008)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"40_CR39","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: unified, real-time object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"40_CR40","unstructured":"Reed, S.E., Akata, Z., Yan, X., Logeswaran, L., Schiele, B., Lee, H.: Generative adversarial text to image synthesis. CoRR abs\/1605.05396 (2016). 
http:\/\/arxiv.org\/abs\/1605.05396"},{"key":"40_CR41","unstructured":"Ren, M., Kiros, R., Zemel, R.: Exploring models and data for image question answering. In: NIPS (2015)"},{"key":"40_CR42","doi-asserted-by":"crossref","unstructured":"Renshaw, D., Kamper, H., Jansen, A., Goldwater, S.: A comparison of neural network methods for unsupervised representation learning on the zero resource speech challenge. In: Proceedings of the Annual Conference of International Speech Communication Association (INTERSPEECH) (2015)","DOI":"10.21437\/Interspeech.2015-644"},{"issue":"2","key":"40_CR43","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1109\/TMM.2003.811618","volume":"5","author":"D Roy","year":"2003","unstructured":"Roy, D.: Grounded spoken language acquisition: experiments in word learning. IEEE Trans. Multimed. 5(2), 197\u2013209 (2003)","journal-title":"IEEE Trans. Multimed."},{"key":"40_CR44","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1207\/s15516709cog2601_4","volume":"26","author":"D Roy","year":"2002","unstructured":"Roy, D., Pentland, A.: Learning words from sights and sounds: a computational model. Cogn. Sci. 26, 113\u2013146 (2002)","journal-title":"Cogn. Sci."},{"key":"40_CR45","unstructured":"Russell, B., Efros, A., Sivic, J., Freeman, W., Zisserman, A.: Using multiple segmentations to discover objects and their extent in image collections. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2006)"},{"key":"40_CR46","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. CoRR abs\/1409.1556 (2014)"},{"issue":"1","key":"40_CR47","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1016\/0364-0213(90)90025-R","volume":"14","author":"ES Spelke","year":"1990","unstructured":"Spelke, E.S.: Principles of object perception. Cogn. Sci. 14(1), 29\u201356 (1990). https:\/\/doi.org\/10.1016\/0364-0213(90)90025-R. 
http:\/\/www.sciencedirect.com\/science\/article\/pii\/036402139090025R","journal-title":"Cogn. Sci."},{"key":"40_CR48","doi-asserted-by":"crossref","unstructured":"Thiolliere, R., Dunbar, E., Synnaeve, G., Versteegh, M., Dupoux, E.: A hybrid dynamic time warping-deep neural network architecture for unsupervised acoustic modeling. In: Proceedings of the Annual Conference of International Speech Communication Association (INTERSPEECH) (2015)","DOI":"10.21437\/Interspeech.2015-640"},{"key":"40_CR49","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"40_CR50","doi-asserted-by":"crossref","unstructured":"de Vries, H., Strub, F., Chandar, S., Pietquin, O., Larochelle, H., Courville, A.C.: GuessWhat?! Visual object discovery through multi-modal dialogue. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.475"},{"key":"40_CR51","unstructured":"Weber, M., Welling, M., Perona, P.: Towards automatic discovery of object categories. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2010)"},{"key":"40_CR52","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"},{"key":"40_CR53","doi-asserted-by":"crossref","unstructured":"Zhang, T., Ramakrishnan, R., Livny, M.: BIRCH: an efficient data clustering method for very large databases. In: ACM SIGMOD International Conference on Management of Data, pp. 103\u2013114 (1996)","DOI":"10.1145\/235968.233324"},{"key":"40_CR54","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Salakhutdinov, R., Chang, H.A., Glass, J.: Resource configurable spoken query detection using deep Boltzmann machines. 
In: Proceedings of the International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2012)","DOI":"10.1109\/ICASSP.2012.6289082"},{"key":"40_CR55","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Object detectors emerge in deep scene CNNs. arXiv preprint arXiv:1412.6856 (2014)"},{"key":"40_CR56","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Object detectors emerge in deep scene CNNs. In: Proceedings of the International Conference on Learning Representations (ICLR) (2015)"},{"key":"40_CR57","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"40_CR58","unstructured":"Zhou, B., Lapedriza, A., Xiao, J., Torralba, A., Oliva, A.: Learning deep features for scene recognition using places database. In: Proceedings of the Neural Information Processing Systems (NIPS) (2014)"},{"key":"40_CR59","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ADE20K dataset. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.544"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01231-1_40","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T18:37:10Z","timestamp":1775241430000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01231-1_40"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012304","9783030012311"],"references-count":59,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01231-1_40","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 
2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}