{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,4]],"date-time":"2026-02-04T16:29:34Z","timestamp":1770222574354,"version":"3.49.0"},"publisher-location":"Cham","reference-count":49,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319464923","type":"print"},{"value":"9783319464930","type":"electronic"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46493-0_12","type":"book-chapter","created":{"date-parts":[[2016,9,16]],"date-time":"2016-09-16T14:59:53Z","timestamp":1474037993000},"page":"186-201","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":92,"title":["A Multi-scale CNN for Affordance Segmentation in RGB Images"],"prefix":"10.1007","author":[{"given":"Anirban","family":"Roy","sequence":"first","affiliation":[]},{"given":"Sinisa","family":"Todorovic","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,17]]},"reference":[{"key":"12_CR1","unstructured":"Gibson, J.J.: The theory of affordances. In: Perceiving, Acting, and Knowing: Toward and Ecological Psychology, pp. 62\u201382. Erlbaum (1977)"},{"key":"12_CR2","doi-asserted-by":"crossref","DOI":"10.4324\/9781315740218","volume-title":"The Ecological Approach to Visual Perception","author":"JJ Gibson","year":"2014","unstructured":"Gibson, J.J.: The Ecological Approach to Visual Perception, Classic edn. Psychology Press, UK (2014)","edition":"Classic"},{"key":"12_CR3","unstructured":"Barrow, H., Tenenbaum, J.: Recovering intrinsic scene characteristics. Comput. Vis. Syst. 3\u201326 (1978)"},{"issue":"1","key":"12_CR4","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1109\/TPAMI.2015.2430335","volume":"38","author":"HS Koppula","year":"2016","unstructured":"Koppula, H.S., Saxena, A.: Anticipating human activities using object affordances for reactive robotic response. Pattern Anal. Mach. Intell. 38(1), 14\u201329 (2016)","journal-title":"Pattern Anal. Mach. Intell."},{"issue":"8","key":"12_CR5","doi-asserted-by":"publisher","first-page":"951","DOI":"10.1177\/0278364913478446","volume":"32","author":"HS Koppula","year":"2013","unstructured":"Koppula, H.S., Gupta, R., Saxena, A.: Learning human activities and object affordances from RGB-D videos. Int. J. Robot. Res. 32(8), 951\u2013970 (2013)","journal-title":"Int. J. Robot. Res."},{"issue":"10","key":"12_CR6","doi-asserted-by":"publisher","first-page":"1775","DOI":"10.1109\/TPAMI.2009.83","volume":"31","author":"A Gupta","year":"2009","unstructured":"Gupta, A., Kembhavi, A., Davis, L.S.: Observing human-object interactions: using spatial and functional compatibility for recognition. Pattern Anal. Mach. Intell. 31(10), 1775\u20131789 (2009)","journal-title":"Pattern Anal. Mach. Intell."},{"issue":"3","key":"12_CR7","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1007\/s11263-014-0710-z","volume":"110","author":"DF Fouhey","year":"2014","unstructured":"Fouhey, D.F., Delaitre, V., Gupta, A., Efros, A.A., Laptev, I., Sivic, J.: People watching: human actions as a cue for single view geometry. Int. J. Comput. Vis. 110(3), 259\u2013274 (2014)","journal-title":"Int. J. Comput. Vis."},{"key":"12_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"284","DOI":"10.1007\/978-3-642-33783-3_21","volume-title":"ECCV 2012","author":"V Delaitre","year":"2012","unstructured":"Delaitre, V., Fouhey, D.F., Laptev, I., Sivic, J., Gupta, A., Efros, A.A.: Scene semantics from long-term observation of people. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7577, pp. 284\u2013298. Springer, Heidelberg (2012)"},{"key":"12_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"408","DOI":"10.1007\/978-3-319-10605-2_27","volume-title":"Computer Vision \u2013 ECCV 2014","author":"Y Zhu","year":"2014","unstructured":"Zhu, Y., Fathi, A., Fei-Fei, L.: Reasoning about object affordances in a knowledge base representation. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8690, pp. 408\u2013424. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10605-2_27"},{"issue":"1","key":"12_CR10","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1016\/j.cviu.2010.08.002","volume":"115","author":"H Kjellstr\u00f6m","year":"2011","unstructured":"Kjellstr\u00f6m, H., Romero, J., Kragi\u0107, D.: Visual object-action recognition: inferring object affordances from human demonstration. Comput. Vis. Image Underst. 115(1), 81\u201390 (2011)","journal-title":"Comput. Vis. Image Underst."},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Yao, B., Ma, J., Fei-Fei, L.: Discovering object functionality. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.312"},{"key":"12_CR12","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Endres, I., Hoiem, D., Forsyth, D.: Describing objects by their attributes. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206772"},{"key":"12_CR13","doi-asserted-by":"crossref","unstructured":"Hoiem, D., Efros, A.A., Hebert, M.: Geometric context from a single image. In:\u00a0ICCV (2005)","DOI":"10.1109\/ICCV.2005.107"},{"key":"12_CR14","doi-asserted-by":"crossref","unstructured":"Chen, C., Seff, A., Kornhauser, A., Xiao, J.: Deepdriving: learning affordance for direct perception in autonomous driving. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.312"},{"issue":"1","key":"12_CR15","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1007\/s11263-006-0031-y","volume":"75","author":"D Hoiem","year":"2007","unstructured":"Hoiem, D., Efros, A.A., Hebert, M.: Recovering surface layout from an image. Int. J. Comput. Vis. 75(1), 151\u2013172 (2007)","journal-title":"Int. J. Comput. Vis."},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Hoiem, D., Efros, A.A., Hebert, M.: Closing the loop in scene interpretation. In: CVPR (2008)","DOI":"10.1109\/CVPR.2008.4587587"},{"issue":"8","key":"12_CR17","doi-asserted-by":"publisher","first-page":"1915","DOI":"10.1109\/TPAMI.2012.231","volume":"35","author":"C Farabet","year":"2013","unstructured":"Farabet, C., Couprie, C., Najman, L., LeCun, Y.: Learning hierarchical features for scene labeling. Pattern Anal. Mach. Intell. 35(8), 1915\u20131929 (2013)","journal-title":"Pattern Anal. Mach. Intell."},{"key":"12_CR18","unstructured":"Pinheiro, P.H., Collobert, R.: Recurrent convolutional neural networks for scene parsing. In: ICML (2014)"},{"key":"12_CR19","unstructured":"Socher, R., Lin, C.C., Manning, C., Ng, A.Y.: Parsing natural scenes and natural language with recursive neural networks. In: ICML (2011)"},{"key":"12_CR20","unstructured":"Couprie, C., Farabet, C., Najman, L., LeCun, Y.: Indoor semantic segmentation using depth information. In: ICLR (2013)"},{"issue":"9","key":"12_CR21","doi-asserted-by":"publisher","first-page":"1360","DOI":"10.1109\/TIP.2005.852470","volume":"14","author":"F Ning","year":"2005","unstructured":"Ning, F., Delhomme, D., LeCun, Y., Piano, F., Bottou, L., Barbano, P.E.: Toward automatic phenotyping of developing embryos from videos. Image Process. 14(9), 1360\u20131371 (2005)","journal-title":"Image Process."},{"key":"12_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"297","DOI":"10.1007\/978-3-319-10584-0_20","volume-title":"Computer Vision \u2013 ECCV 2014","author":"B Hariharan","year":"2014","unstructured":"Hariharan, B., Arbel\u00e1ez, P., Girshick, R., Malik, J.: Simultaneous detection and segmentation. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8695, pp. 297\u2013312. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10584-0_20"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Ganin, Y., Lempitsky, V.: $$N^{4}$$-fields: neural network nearest neighbor fields for image transforms. In: ACCV (2014)","DOI":"10.1007\/978-3-319-16808-1_36"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Eigen, D., Fergus, R.: Predicting depth, surface normals and semantic labels with a common multi-scale convolutional architecture. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.304"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Zhao, Y., Chun Zhu, S.: Understanding tools: task-oriented object modeling, learning and recognition. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298903"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Myers, A., Kanazawa, A., Fermuller, C., Aloimonos, Y.: Affordance of object parts from geometric features. In: Workshop on Vision meets Cognition, CVPR (2014)","DOI":"10.1109\/ICRA.2015.7139369"},{"key":"12_CR27","unstructured":"Hermans, T., Rehg, J.M., Bobick, A.: Affordance prediction via learned object attributes. In: ICRA: Workshop on Semantic Perception, Mapping, and Exploration (2011)"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Gupta, A., Satkin, S., Efros, A.A., Hebert, M.: From 3D scene geometry to human workspace. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995448"},{"key":"12_CR29","unstructured":"Fouhey, D.F., Wang, X., Gupta, A.: In defense of the direct perception of affordances (2015). arXiv preprint arXiv:1505.01085"},{"key":"12_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"746","DOI":"10.1007\/978-3-642-33715-4_54","volume-title":"Computer Vision \u2013 ECCV 2012","author":"N Silberman","year":"2012","unstructured":"Silberman, N., Hoiem, D., Kohli, P., Fergus, R.: Indoor segmentation and support inference from RGBD images. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7576, pp. 746\u2013760. Springer, Heidelberg (2012). doi: 10.1007\/978-3-642-33715-4_54"},{"key":"12_CR31","doi-asserted-by":"crossref","unstructured":"Gupta, A., Davis, L.S.: Objects in action: an approach for combining action understanding and object perception. In: CVPR (2007)","DOI":"10.1109\/CVPR.2007.383331"},{"key":"12_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"336","DOI":"10.1007\/978-3-540-88688-4_25","volume-title":"Computer Vision \u2013 ECCV 2008","author":"H Kjellstr\u00f6m","year":"2008","unstructured":"Kjellstr\u00f6m, H., Romero, J., Mart\u00ednez, D., Kragi\u0107, D.: Simultaneous visual recognition of manipulation actions and manipulated objects. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008. LNCS, vol. 5303, pp. 336\u2013349. Springer, Heidelberg (2008). doi: 10.1007\/978-3-540-88688-4_25"},{"key":"12_CR33","doi-asserted-by":"crossref","unstructured":"Yao, B., Fei-Fei, L.: Modeling mutual context of object and human pose in human-object interaction activities. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540235"},{"issue":"3","key":"12_CR34","doi-asserted-by":"publisher","first-page":"207","DOI":"10.1109\/TAMD.2011.2106782","volume":"3","author":"C Castellini","year":"2011","unstructured":"Castellini, C., Tommasi, T., Noceti, N., Odone, F., Caputo, B.: Using object affordances to improve object recognition. Auton. Mental Dev. 3(3), 207\u2013215 (2011)","journal-title":"Auton. Mental Dev."},{"key":"12_CR35","unstructured":"Winston, P.H., Binford, T.O., Katz, B., Lowry, M.: Learning physical descriptions from functional definitions, examples, and precedents. Department of Computer Science, Stanford University (1983)"},{"issue":"10","key":"12_CR36","doi-asserted-by":"publisher","first-page":"1097","DOI":"10.1109\/34.99242","volume":"13","author":"L Stark","year":"1991","unstructured":"Stark, L., Bowyer, K.: Achieving generalized object recognition through reasoning about association of function to structure. Pattern Anal. Mach. Intell. 13(10), 1097\u20131104 (1991)","journal-title":"Pattern Anal. Mach. Intell."},{"issue":"2","key":"12_CR37","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1006\/cviu.1995.1048","volume":"62","author":"E Rivlin","year":"1995","unstructured":"Rivlin, E., Dickinson, S.J., Rosenfeld, A.: Recognition by functional parts. Comput. Vis. Image Underst. 62(2), 164\u2013176 (1995)","journal-title":"Comput. Vis. Image Underst."},{"key":"12_CR38","doi-asserted-by":"crossref","unstructured":"Grabner, H., Gall, J., Van Gool, L.: What makes a chair a chair? In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995327"},{"key":"12_CR39","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Koppula, H., Saxena, A.: Hallucinated humans as the hidden context for labeling 3D scenes. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.385"},{"issue":"2","key":"12_CR40","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1177\/0278364907087172","volume":"27","author":"A Saxena","year":"2008","unstructured":"Saxena, A., Driemeyer, J., Ng, A.Y.: Robotic grasping of novel objects using vision. Int. J. Robot. Res. 27(2), 157\u2013173 (2008)","journal-title":"Int. J. Robot. Res."},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Yu, L.F., Duncan, N., Yeung, S.K.: Fill and transfer: a simple physics-based approach for containability reasoning. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.88"},{"key":"12_CR42","unstructured":"Xie, D., Todorovic, S., Zhu, S.C.: Inferring dark matter and dark energy from videos. In: ICCV (2013)"},{"key":"12_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"831","DOI":"10.1007\/978-3-319-10578-9_54","volume-title":"Computer Vision \u2013 ECCV 2014","author":"HS Koppula","year":"2014","unstructured":"Koppula, H.S., Saxena, A.: Physically grounded spatio-temporal object affordances. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8691, pp. 831\u2013847. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10578-9_54"},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Zhu, S.C.: Scene parsing by integrating function, geometry and appearance models. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.401"},{"key":"12_CR45","doi-asserted-by":"crossref","unstructured":"Gupta, S., Arbelaez, P., Malik, J.: Perceptual organization and recognition of indoor scenes from RGB-D images. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.79"},{"key":"12_CR46","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"12_CR47","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition (2014). arXiv preprint arXiv:1409.1556"},{"key":"12_CR48","unstructured":"Eigen, D., Puhrsch, C., Fergus, R.: Depth map prediction from a single image using a multi-scale deep network. In: NIPS (2014)"},{"key":"12_CR49","doi-asserted-by":"crossref","unstructured":"Hedau, V., Hoiem, D., Forsyth, D.: Recovering the spatial layout of cluttered rooms. In: ICCV (2009)","DOI":"10.1109\/ICCV.2009.5459411"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2016"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46493-0_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T19:20:59Z","timestamp":1749583259000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-46493-0_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319464923","9783319464930"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46493-0_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"17 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.eccv2016.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}