{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T16:42:09Z","timestamp":1774975329739,"version":"3.50.1"},"publisher-location":"Cham","reference-count":58,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012489","type":"print"},{"value":"9783030012496","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01249-6_44","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T11:35:46Z","timestamp":1538739346000},"page":"729-745","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["A Dataset and Architecture for Visual Reasoning with a Working Memory"],"prefix":"10.1007","author":[{"given":"Guangyu Robert","family":"Yang","sequence":"first","affiliation":[]},{"given":"Igor","family":"Ganichev","sequence":"additional","affiliation":[]},{"given":"Xiao-Jing","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jonathon","family":"Shlens","sequence":"additional","affiliation":[]},{"given":"David","family":"Sussillo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"issue":"2","key":"44_CR1","doi-asserted-by":"publisher","first-page":"245","DOI":"10.1016\/j.neuron.2017.06.011","volume":"95","author":"D Hassabis","year":"2017","unstructured":"Hassabis, D., Kumaran, D., Summerfield, C., Botvinick, M.: Neuroscience-inspired artificial intelligence. Neuron 95(2), 245\u2013258 (2017)","journal-title":"Neuron"},{"key":"44_CR2","doi-asserted-by":"crossref","unstructured":"Hu, R., Andreas, J., Rohrbach, M., Darrell, T., Saenko, K.: Learning to reason: end-to-end module networks for visual question answering. CoRR, abs\/1704.05526, vol. 3 (2017)","DOI":"10.1109\/ICCV.2017.93"},{"key":"44_CR3","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Inferring and executing programs for visual reasoning. arXiv preprint arXiv:1705.03633 (2017)","DOI":"10.1109\/ICCV.2017.325"},{"key":"44_CR4","unstructured":"Santoro, A., et al.: A simple neural network module for relational reasoning. In: Advances in Neural Information Processing Systems, pp. 4974\u20134983 (2017)"},{"key":"44_CR5","doi-asserted-by":"crossref","unstructured":"Perez, E., Strub, F., De Vries, H., Dumoulin, V., Courville, A.: Film: visual reasoning with a general conditioning layer. arXiv preprint arXiv:1709.07871 (2017)","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"44_CR6","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"44_CR7","unstructured":"Gao, H., Mao, J., Zhou, J., Huang, Z., Wang, L., Xu, W.: Are you talking to a machine? Dataset and methods for multilingual image question. In: Advances in Neural Information Processing Systems, pp. 2296\u20132304 (2015)"},{"key":"44_CR8","unstructured":"Malinowski, M., Fritz, M.: A multi-world approach to question answering about real-world scenes based on uncertain input. In: Advances in Neural Information Processing Systems, pp. 1682\u20131690 (2014)"},{"key":"44_CR9","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M., Fei-Fei, L.: Visual7W: grounded question answering in images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4995\u20135004 (2016)","DOI":"10.1109\/CVPR.2016.540"},{"key":"44_CR10","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., van der Maaten, L., Fei-Fei, L., Zitnick, C.L., Girshick, R.: CLEVR: a diagnostic dataset for compositional language and elementary visual reasoning. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1988\u20131997. IEEE (2017)","DOI":"10.1109\/CVPR.2017.215"},{"issue":"6","key":"44_CR11","doi-asserted-by":"publisher","first-page":"1636","DOI":"10.1109\/TMM.2014.2330697","volume":"16","author":"BL Sturm","year":"2014","unstructured":"Sturm, B.L.: A simple method to determine if a music information retrieval system is a horse. IEEE Trans. Multimed. 16(6), 1636\u20131644 (2014)","journal-title":"IEEE Trans. Multimed."},{"key":"44_CR12","doi-asserted-by":"crossref","unstructured":"Agrawal, A., Batra, D., Parikh, D.: Analyzing the behavior of visual question answering models. arXiv preprint arXiv:1606.07356 (2016)","DOI":"10.18653\/v1\/D16-1203"},{"key":"44_CR13","volume-title":"Understanding Natural Language","author":"T Winograd","year":"1972","unstructured":"Winograd, T.: Understanding Natural Language. Academic Press Inc., Orlando (1972)"},{"key":"44_CR14","unstructured":"Mnih, V., et al.: Playing atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602 (2013)"},{"issue":"7540","key":"44_CR15","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1038\/nature14236","volume":"518","author":"V Mnih","year":"2015","unstructured":"Mnih, V., et al.: Human-level control through deep reinforcement learning. Nature 518(7540), 529 (2015)","journal-title":"Nature"},{"key":"44_CR16","unstructured":"Vinyals, O., et al.: StarCraft II: a new challenge for reinforcement learning. arXiv preprint arXiv:1708.04782 (2017)"},{"key":"44_CR17","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"44_CR18","unstructured":"Abu-El-Haija, S., et al.: YouTube-8M: a large-scale video classification benchmark. arXiv preprint arXiv:1609.08675 (2016)"},{"key":"44_CR19","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., Ghanem, B., Carlos Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"44_CR20","unstructured":"Hudson, D.A., Manning, C.D.: Compositional attention networks for machine reasoning. In: International Conference on Learning Representations (2018)"},{"key":"44_CR21","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1146\/annurev-psych-113011-143750","volume":"64","author":"A Diamond","year":"2013","unstructured":"Diamond, A.: Executive functions. Ann. Rev. Psychol. 64, 135\u2013168 (2013)","journal-title":"Ann. Rev. Psychol."},{"issue":"1","key":"44_CR22","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1006\/cogp.1999.0734","volume":"41","author":"A Miyake","year":"2000","unstructured":"Miyake, A., Friedman, N.P., Emerson, M.J., Witzki, A.H., Howerter, A., Wager, T.D.: The unity and diversity of executive functions and their contributions to complex frontal lobe tasks: a latent variable analysis. Cogn. Psychol. 41(1), 49\u2013100 (2000)","journal-title":"Cogn. Psychol."},{"issue":"1","key":"44_CR23","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1080\/00221309.1948.9918159","volume":"39","author":"EA Berg","year":"1948","unstructured":"Berg, E.A.: A simple objective technique for measuring flexibility in thinking. J. Gen. Psychol. 39(1), 15\u201322 (1948)","journal-title":"J. Gen. Psychol."},{"issue":"1","key":"44_CR24","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1001\/archneur.1963.00460070100010","volume":"9","author":"B Milner","year":"1963","unstructured":"Milner, B.: Effects of different brain lesions on card sorting: the role of the frontal lobes. Arch. Neurol. 9(1), 90\u2013100 (1963)","journal-title":"Arch. Neurol."},{"issue":"5044","key":"44_CR25","doi-asserted-by":"publisher","first-page":"556","DOI":"10.1126\/science.1736359","volume":"255","author":"A Baddeley","year":"1992","unstructured":"Baddeley, A.: Working memory. Science 255(5044), 556\u2013559 (1992)","journal-title":"Science"},{"issue":"16","key":"44_CR26","doi-asserted-by":"publisher","first-page":"5154","DOI":"10.1523\/JNEUROSCI.16-16-05154.1996","volume":"16","author":"EK Miller","year":"1996","unstructured":"Miller, E.K., Erickson, C.A., Desimone, R.: Neural mechanisms of visual working memory in prefrontal cortex of the macaque. J. Neurosci. 16(16), 5154\u20135167 (1996)","journal-title":"J. Neurosci."},{"issue":"1","key":"44_CR27","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1146\/annurev.neuro.24.1.167","volume":"24","author":"EK Miller","year":"2001","unstructured":"Miller, E.K., Cohen, J.D.: An integrative theory of prefrontal cortex function. Ann. Rev. Neurosci. 24(1), 167\u2013202 (2001)","journal-title":"Ann. Rev. Neurosci."},{"issue":"6237","key":"44_CR28","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1038\/341052a0","volume":"341","author":"WT Newsome","year":"1989","unstructured":"Newsome, W.T., Britten, K.H., Movshon, J.A.: Neuronal correlates of a perceptual decision. Nature 341(6237), 52 (1989)","journal-title":"Nature"},{"issue":"3","key":"44_CR29","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1038\/nrn1058","volume":"4","author":"R Romo","year":"2003","unstructured":"Romo, R., Salinas, E.: Cognitive neuroscience: flutter discrimination: neural codes, perception, memory and decision making. Nat. Rev. Neurosci. 4(3), 203 (2003)","journal-title":"Nat. Rev. Neurosci."},{"issue":"7474","key":"44_CR30","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1038\/nature12742","volume":"503","author":"V Mante","year":"2013","unstructured":"Mante, V., Sussillo, D., Shenoy, K.V., Newsome, W.T.: Context-dependent computation by recurrent dynamics in prefrontal cortex. Nature 503(7474), 78 (2013)","journal-title":"Nature"},{"issue":"7451","key":"44_CR31","doi-asserted-by":"publisher","first-page":"585","DOI":"10.1038\/nature12160","volume":"497","author":"M Rigotti","year":"2013","unstructured":"Rigotti, M., et al.: The importance of mixed selectivity in complex cognitive tasks. Nature 497(7451), 585 (2013)","journal-title":"Nature"},{"issue":"1","key":"44_CR32","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1177\/001872086300500102","volume":"5","author":"DB Yntema","year":"1963","unstructured":"Yntema, D.B.: Keeping track of several things at once. Hum. Factors 5(1), 7\u201317 (1963)","journal-title":"Hum. Factors"},{"issue":"1","key":"44_CR33","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1016\/S0885-2014(96)90027-1","volume":"11","author":"PD Zelazo","year":"1996","unstructured":"Zelazo, P.D., Frye, D., Rapus, T.: An age-related dissociation between knowing rules and using them. Cogn. Dev. 11(1), 37\u201363 (1996)","journal-title":"Cogn. Dev."},{"issue":"1","key":"44_CR34","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1002\/hbm.20131","volume":"25","author":"AM Owen","year":"2005","unstructured":"Owen, A.M., McMillan, K.M., Laird, A.R., Bullmore, E.: N-back working memory paradigm: a meta-analysis of normative functional neuroimaging studies. Hum. Brain Mapp. 25(1), 46\u201359 (2005)","journal-title":"Hum. Brain Mapp."},{"key":"44_CR35","unstructured":"Graves, A., Wayne, G., Danihelka, I.: Neural turing machines. CoRR abs\/1410.5401 (2014)"},{"key":"44_CR36","unstructured":"Joulin, A., Mikolov, T.: Inferring algorithmic patterns with stack-augmented recurrent nets. CoRR abs\/1503.01007 (2015)"},{"key":"44_CR37","unstructured":"Collins, J., Sohl-Dickstein, J., Sussillo, D.: Capacity and trainability in recurrent neural networks. Stat 1050, 28 (2017)"},{"issue":"8","key":"44_CR38","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"issue":"7626","key":"44_CR39","doi-asserted-by":"publisher","first-page":"471","DOI":"10.1038\/nature20101","volume":"538","author":"A Graves","year":"2016","unstructured":"Graves, A., et al.: Hybrid computing using a neural network with dynamic external memory. Nature 538(7626), 471\u2013476 (2016)","journal-title":"Nature"},{"key":"44_CR40","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"44_CR41","doi-asserted-by":"crossref","unstructured":"Ng, J.Y.H., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., Toderici, G.: Beyond short snippets: deep networks for video classification. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4694\u20134702. IEEE (2015)","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"44_CR42","unstructured":"Weston, J., et al.: Towards AI-complete question answering: a set of prerequisite toy tasks. arXiv preprint arXiv:1502.05698 (2015)"},{"key":"44_CR43","doi-asserted-by":"crossref","unstructured":"Zitnick, C.L., Parikh, D.: Bringing semantics into focus using visual abstraction. In: 2013 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3009\u20133016. IEEE (2013)","DOI":"10.1109\/CVPR.2013.387"},{"key":"44_CR44","unstructured":"Kuhnle, A., Copestake, A.: ShapeWorld-a new test methodology for multimodal language understanding. arXiv preprint arXiv:1704.04517 (2017)"},{"key":"44_CR45","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-319-46478-7_28","volume-title":"Computer Vision \u2013 ECCV 2016","author":"H Xu","year":"2016","unstructured":"Xu, H., Saenko, K.: Ask, attend and answer: exploring question-guided spatial attention for visual question answering. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 451\u2013466. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_28"},{"key":"44_CR46","unstructured":"Dumoulin, V., Shlens, J., Kudlur, M.: A learned representation for artistic style. In: International Conference on Learning Representations (ICLR) (2017)"},{"issue":"6657","key":"44_CR47","doi-asserted-by":"publisher","first-page":"279","DOI":"10.1038\/36846","volume":"390","author":"SJ Luck","year":"1997","unstructured":"Luck, S.J., Vogel, E.K.: The capacity of visual working memory for features and conjunctions. Nature 390(6657), 279 (1997)","journal-title":"Nature"},{"issue":"1","key":"44_CR48","doi-asserted-by":"publisher","first-page":"1","DOI":"10.3758\/s13415-012-0125-7","volume":"13","author":"MW Cole","year":"2013","unstructured":"Cole, M.W., Laurent, P., Stocco, A.: Rapid instructed task learning: a new window into the human brains unique capacity for flexible cognitive control. Cogn. Affect. Behav. Neurosci. 13(1), 1\u201322 (2013)","journal-title":"Cogn. Affect. Behav. Neurosci."},{"key":"44_CR49","unstructured":"Graves, A.: Adaptive computation time for recurrent neural networks. arXiv preprint arXiv:1603.08983 (2016)"},{"key":"44_CR50","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: International Conference on Machine Learning, pp. 448\u2013456 (2015)"},{"key":"44_CR51","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057 (2015)"},{"issue":"11","key":"44_CR52","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Sig. Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Sig. Process."},{"key":"44_CR53","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"issue":"1","key":"44_CR54","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1146\/annurev.neuro.20.1.303","volume":"20","author":"RA Andersen","year":"1997","unstructured":"Andersen, R.A., Snyder, L.H., Bradley, D.C., Xing, J.: Multimodal representation of space in the posterior parietal cortex and its use in planning movements. Ann. Rev. Neurosci. 20(1), 303\u2013330 (1997)","journal-title":"Ann. Rev. Neurosci."},{"key":"44_CR55","unstructured":"Xingjian, S., Chen, Z., Wang, H., Yeung, D.Y., Wong, W.K., Woo, W.C.: Convolutional LSTM network: a machine learning approach for precipitation nowcasting. In: Advances in Neural Information Processing Systems, pp. 802\u2013810 (2015)"},{"key":"44_CR56","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"44_CR57","doi-asserted-by":"crossref","unstructured":"Yang, G.R., Song, H.F., Newsome, W.T., Wang, X.J.: Clustering and compositionality of task representations in a neural network trained to perform many cognitive tasks. bioRxiv, p. 183632 (2017)","DOI":"10.1101\/183632"},{"key":"44_CR58","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Advances in Neural Information Processing Systems, pp. 3111\u20133119 (2013)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01249-6_44","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T15:10:16Z","timestamp":1774969816000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01249-6_44"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012489","9783030012496"],"references-count":58,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01249-6_44","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}