{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T16:00:58Z","timestamp":1743004858539,"version":"3.40.3"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030110086"},{"type":"electronic","value":"9783030110093"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-11009-3_32","type":"book-chapter","created":{"date-parts":[[2019,1,24]],"date-time":"2019-01-24T06:24:44Z","timestamp":1548311084000},"page":"521-537","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Answering Visual What-If Questions: From Actions to Predicted Scene Descriptions"],"prefix":"10.1007","author":[{"given":"Misha","family":"Wagner","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hector","family":"Basevi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rakshith","family":"Shetty","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenbin","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mateusz","family":"Malinowski","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mario","family":"Fritz","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ale\u0161","family":"Leonardis","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,1,23]]},"reference":[{"key":"32_CR1","unstructured":"Mathieu, M., Couprie, C., LeCun, Y.: Deep multi-scale video prediction beyond mean square error. In: International Conference on Learning Representations (2016)"},{"key":"32_CR2","doi-asserted-by":"crossref","unstructured":"Bhattacharyya, A., Malinowski, M., Schiele, B., Fritz, M.: Long-term image boundary extrapolation. In: Association for the Advancement of Artificial Intelligence (2018)","DOI":"10.1609\/aaai.v32i1.11811"},{"key":"32_CR3","unstructured":"Lerer, A., Gross, S., Fergus, R.: Learning physical intuition of block towers by example. In: International Conference on Machine Learning (2016)"},{"key":"32_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"269","DOI":"10.1007\/978-3-319-46493-0_17","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Mottaghi","year":"2016","unstructured":"Mottaghi, R., Rastegari, M., Gupta, A., Farhadi, A.: \u201cWhat happens if...\u201d learning to predict the effect of forces in images. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 269\u2013285. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_17"},{"issue":"4","key":"32_CR5","doi-asserted-by":"publisher","first-page":"122","DOI":"10.1038\/scientificamerican0483-122","volume":"248","author":"M McCloskey","year":"1983","unstructured":"McCloskey, M.: Intuitive physics. Sci. Am. 248(4), 122\u2013131 (1983)","journal-title":"Sci. Am."},{"key":"32_CR6","doi-asserted-by":"crossref","unstructured":"Grzeszczuk, R., Terzopoulos, D., Hinton, G.: Neuroanimator: fast neural network emulation and control of physics-based models. In: Proceedings of the 25th Annual Conference on Computer Graphics and Interactive Techniques, pp. 9\u201320. ACM (1998)","DOI":"10.1145\/280814.280816"},{"issue":"45","key":"32_CR7","doi-asserted-by":"publisher","first-page":"18327","DOI":"10.1073\/pnas.1306572110","volume":"110","author":"PW Battaglia","year":"2013","unstructured":"Battaglia, P.W., Hamrick, J.B., Tenenbaum, J.B.: Simulation as an engine of physical scene understanding. Proc. Natl. Acad. Sci. 110(45), 18327\u201318332 (2013)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"32_CR8","unstructured":"Battaglia, P., Pascanu, R., Lai, M., Rezende, D.J., et al.: Interaction networks for learning about objects, relations and physics. In: Advances in Neural Information Processing Systems, pp. 4502\u20134510 (2016)"},{"key":"32_CR9","unstructured":"Watters, N., Tacchetti, A., Weber, T., Pascanu, R., Battaglia, P., Zoran, D.: Visual interaction networks. In: Advances in Neural Information Processing Systems (2017)"},{"key":"32_CR10","unstructured":"Wu, J., Yildirim, I., Lim, J.J., Freeman, B., Tenenbaum, J.: Galileo: perceiving physical object properties by integrating a physics engine with deep learning. In: Advances in Neural Information Processing Systems, pp. 127\u2013135 (2015)"},{"key":"32_CR11","doi-asserted-by":"crossref","unstructured":"Li, W., Leonardis, A., Fritz, M.: Visual stability prediction for robotic manipulation. In: Proceedings of the IEEE International Conference on Robotics and Automation (2017)","DOI":"10.1109\/ICRA.2017.7989304"},{"key":"32_CR12","unstructured":"Li, W., Azimi, S., Leonardis, A., Fritz, M.: To fall or not to fall: a visual approach to physical stability prediction. CoRR abs\/1604.00066 (2016)"},{"key":"32_CR13","unstructured":"Ranzato, M., Szlam, A., Bruna, J., Mathieu, M., Collobert, R., Chopra, S.: Video (language) modeling: a baseline for generative models of natural videos. CoRR abs\/1412.6604 (2014)"},{"key":"32_CR14","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., Bagherinezhad, H., Rastegari, M., Farhadi, A.: Newtonian scene understanding: unfolding the dynamics of objects in static images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3521\u20133529 (2016)","DOI":"10.1109\/CVPR.2016.383"},{"key":"32_CR15","unstructured":"Malinowski, M., Fritz, M.: A multi-world approach to question answering about real-world scenes based on uncertain input. In: Advances in Neural Information Processing Systems, pp. 1682\u20131690 (2014)"},{"key":"32_CR16","unstructured":"Malinowski, M., Fritz, M.: Towards a visual turing challenge. CoRR abs\/1410.8027 (2014)"},{"issue":"12","key":"32_CR17","doi-asserted-by":"crossref","first-page":"3618","DOI":"10.1073\/pnas.1422953112","volume":"112","author":"D Geman","year":"2015","unstructured":"Geman, D., Geman, S., Hallonquist, N., Younes, L.: Visual Turing test for computer vision systems. Proc. Natl. Acad. Sci. 112(12), 3618\u20133623 (2015)","journal-title":"Proc. Natl. Acad. Sci."},{"key":"32_CR18","unstructured":"Ren, M., Kiros, R., Zemel, R.: Exploring models and data for image question answering. In: Advances in Neural Information Processing Systems (2015)"},{"key":"32_CR19","doi-asserted-by":"crossref","unstructured":"Yu, L., Park, E., Berg, A.C., Berg, T.L.: Visual madlibs: fill in the blank description generation and question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2461\u20132469. IEEE (2015)","DOI":"10.1109\/ICCV.2015.283"},{"key":"32_CR20","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M., Fei-Fei, L.: Visual7W: grounded question answering in images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4995\u20135004 (2016)","DOI":"10.1109\/CVPR.2016.540"},{"key":"32_CR21","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., Fidler, S.: MovieQA: understanding stories in movies through question-answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4631\u20134640 (2016)","DOI":"10.1109\/CVPR.2016.501"},{"key":"32_CR22","doi-asserted-by":"crossref","unstructured":"Agrawal, A., Batra, D., Parikh, D., Kembhavi, A.: Don\u2019t just assume; look and answer: overcoming priors for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00522"},{"key":"32_CR23","doi-asserted-by":"crossref","unstructured":"Kafle, K., Cohen, S., Price, B., Kanan, C.: DVQA: understanding data visualizations via question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00592"},{"key":"32_CR24","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"issue":"1\u20133","key":"32_CR25","doi-asserted-by":"publisher","first-page":"110","DOI":"10.1007\/s11263-017-1038-2","volume":"125","author":"M Malinowski","year":"2017","unstructured":"Malinowski, M., Rohrbach, M., Fritz, M.: Ask your neurons: a deep learning approach to visual question answering. Int. J. Comput. Vis. 125(1\u20133), 110\u2013135 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"32_CR26","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"32_CR27","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. CoRR abs\/1606.01847 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"32_CR28","doi-asserted-by":"crossref","unstructured":"Hu, R., Andreas, J., Rohrbach, M., Darrell, T., Saenko, K.: Learning to reason: end-to-end module networks for visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.93"},{"key":"32_CR29","unstructured":"Santoro, A., et al.: A simple neural network module for relational reasoning. In: Advances in Neural Information Processing Systems, pp. 4974\u20134983 (2017)"},{"key":"32_CR30","unstructured":"Ehrhardt, S., Monszpart, A., Mitra, N.J., Vedaldi, A.: Taking visual motion prediction to new heightfields. CoRR abs\/1712.09448 (2017)"},{"key":"32_CR31","volume-title":"Reinforcement Learning: An Introduction","author":"RS Sutton","year":"1998","unstructured":"Sutton, R.S., Barto, A.G.: Reinforcement Learning: An Introduction, vol. 1. MIT Press, Cambridge (1998)"},{"key":"32_CR32","unstructured":"Beattie, C., et al.: DeepMind Lab. CoRR abs\/1612.03801 (2016)"},{"key":"32_CR33","doi-asserted-by":"crossref","unstructured":"Kempka, M., Wydmuch, M., Runc, G., Toczek, J., Ja\u015bkowski, W.: ViZDoom: a doom-based AI research platform for visual reinforcement learning. In: IEEE Conference on Computational Intelligence and Games, pp. 1\u20138. IEEE (2016)","DOI":"10.1109\/CIG.2016.7860433"},{"key":"32_CR34","series-title":"Springer Proceedings in Advanced Robotics","doi-asserted-by":"publisher","first-page":"621","DOI":"10.1007\/978-3-319-67361-5_40","volume-title":"Field and Service Robotics","author":"S Shah","year":"2018","unstructured":"Shah, S., Dey, D., Lovett, C., Kapoor, A.: AirSim: high-fidelity visual and physical simulation for autonomous vehicles. In: Hutter, M., Siegwart, R. (eds.) Field and Service Robotics. SPAR, vol. 5, pp. 621\u2013635. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-67361-5_40"},{"key":"32_CR35","unstructured":"Wu, Y., Wu, Y., Gkioxari, G., Tian, Y.: Building generalizable agents with a realistic and rich 3D environment. CoRR abs\/1801.02209 (2018)"},{"key":"32_CR36","unstructured":"Finn, C., Goodfellow, I., Levine, S.: Unsupervised learning for physical interaction through video prediction. In: Advances in Neural Information Processing Systems, pp. 64\u201372 (2016)"},{"key":"32_CR37","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"32_CR38","unstructured":"\u00c7alli, B., Walsman, A., Singh, A., Srinivasa, S., Abbeel, P., Dollar, A.M.: Benchmarking in manipulation research: the YCB object and model set and benchmarking protocols. CoRR abs\/1502.03143 (2015)"},{"key":"32_CR39","unstructured":"Coumans, E.: Bullet 3 (2018). https:\/\/github.com\/bulletphysics\/bullet3"},{"key":"32_CR40","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: GloVe: global vectors for word representation. In: Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"32_CR41","unstructured":"Chollet, F., et al.: Keras (2015). https:\/\/github.com\/keras-team\/keras"},{"key":"32_CR42","unstructured":"Abadi, M., et al.: TensorFlow: large-scale machine learning on heterogeneous systems (2015). Software available from tensorflow.org"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-11009-3_32","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,22]],"date-time":"2023-01-22T01:11:11Z","timestamp":1674349871000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-11009-3_32"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030110086","9783030110093"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-11009-3_32","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"23 January 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}