{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T11:05:39Z","timestamp":1769943939667,"version":"3.49.0"},"reference-count":221,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/OAPA.html"}],"funder":[{"DOI":"10.13039\/501100000923","name":"Australian Research Council","doi-asserted-by":"publisher","award":["DP150104645"],"award-info":[{"award-number":["DP150104645"]}],"id":[{"id":"10.13039\/501100000923","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2019]]},"DOI":"10.1109\/access.2018.2886133","type":"journal-article","created":{"date-parts":[[2018,12,12]],"date-time":"2018-12-12T20:22:08Z","timestamp":1544646128000},"page":"1859-1887","source":"Crossref","is-referenced-by-count":99,"title":["Indoor Scene Understanding in 2.5\/3D for Autonomous Agents: A Survey"],"prefix":"10.1109","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7663-7161","authenticated-orcid":false,"given":"Muzammal","family":"Naseer","sequence":"first","affiliation":[]},{"given":"Salman","family":"Khan","sequence":"additional","affiliation":[]},{"given":"Fatih","family":"Porikli","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2014.2305100"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2016.35"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.359"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.5244\/C.27.112"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.323"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2682981"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.257"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.4324\/9781315740218"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.78"},{"key":"ref168","first-page":"92","article-title":"RGBD salient object detection: A benchmark and algorithms","author":"peng","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref169","first-page":"454","article-title":"Leveraging stereopsis for saliency analysis","author":"niu","year":"2012","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit (CVPR)"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6385773"},{"key":"ref38","doi-asserted-by":"crossref","first-page":"139","DOI":"10.1145\/2897824.2925867","article-title":"PiGraphs: Learning interaction snapshots from observations","volume":"35","author":"savva","year":"2016","journal-title":"ACM Trans Graph"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref32","author":"chang","year":"2017","journal-title":"Matterport3d Learning from rgb-d data in indoor environments"},{"key":"ref31","author":"armeni","year":"2017","journal-title":"Joint 2d-3d-semantic data for indoor scene understanding"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"ref37","author":"mccormac","year":"2016","journal-title":"SceneNet RGB-D 5M Photorealistic Images of Synthetic Indoor Trajectories with Ground Truth"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2016.18"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2011.5980382"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.28"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.385"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995327"},{"key":"ref185","first-page":"1351","article-title":"Towards holistic scene understanding: Feedback enabled cascaded classification models","author":"li","year":"2010","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref184","first-page":"186","article-title":"A multi-scale CNN for affordance segmentation in RGB images","author":"roy","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989535"},{"key":"ref182","first-page":"1","article-title":"Functional descriptors for object affordances","author":"pieropan","year":"2015","journal-title":"Proc of Workshop on IROS91"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.231"},{"key":"ref188","first-page":"70","article-title":"A generic model to compose vision modules for holistic scene understanding","author":"li","year":"2010","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0777-6"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.12"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_23"},{"key":"ref27","author":"simonyan","year":"2014","journal-title":"Very Deep Convolutional Networks for Large-scale Image Recognition"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206772"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.458"},{"key":"ref20","article-title":"Machine perception of three-dimensional solids","author":"roberts","year":"1963"},{"key":"ref22","article-title":"Visual perception by computer","author":"binford","year":"1971","journal-title":"Proc IEEE Syst Contr Conf"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/1476589.1476631"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.94"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/0734-189X(85)90002-7"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref25","author":"zhang","year":"2016","journal-title":"Deepcontext Context-encoding neural pathways for 3d holistic scene understanding"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_43"},{"key":"ref51","first-page":"307","article-title":"FPNN: Field probing neural networks for 3D data","author":"li","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref154","author":"krull","year":"2016","journal-title":"Poseagent Budget-constrained 6d object pose estimation via reinforcement learning"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126326"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.115"},{"key":"ref155","first-page":"535","article-title":"Generic 3D representation via pose estimation and matching","author":"zamir","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref150","first-page":"536","article-title":"Learning 6D object pose estimation using 3D object coordinates","author":"brachmann","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.70"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.372"},{"key":"ref146","first-page":"5","article-title":"Automated risk assessment for scene understanding and domestic robots using RGB-D data and 2.5 D CNNs at a patch level","author":"dupre","year":"2017","journal-title":"Proc IEEE Conf Comp Vis Pattern Recognit"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587597"},{"key":"ref148","first-page":"1","article-title":"TextonBoost: Joint appearance, shape and context modeling for multi-class object recognition and segmentation","author":"shotton","year":"2006","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2015.7139363"},{"key":"ref59","first-page":"205","article-title":"Deep learning of local RGB-D patches for 3D object detection and 6D pose estimation","author":"kehl","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref58","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref57","first-page":"628","article-title":"3D-R2N2: A unified approach for single and multi-view 3D object reconstruction","author":"choy","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref56","author":"graves","year":"2014","journal-title":"Neural Turing machines"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.114"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2014.6836101"},{"key":"ref167","first-page":"101","article-title":"Depth matters: Influence of depth cues on visual saliency","author":"lang","year":"2012","journal-title":"Computer"},{"key":"ref166","first-page":"365","article-title":"Single image 3D interpreter network","author":"wu","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref165","author":"riegler","year":"2017","journal-title":"Octnetfusion Learning depth fusion from data"},{"key":"ref164","first-page":"5556","article-title":"Robust reconstruction of indoor scenes","author":"choi","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1145\/237170.237216"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299105"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR.2011.6092378"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2012.111"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1117\/12.479722"},{"key":"ref6","year":"2017","journal-title":"Qualcomm Announces 3D Camera Technology for Android Ecosystem"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/IVS.2000.898373"},{"key":"ref8","first-page":"16","article-title":"Obstacle avoidance system for assisting visually impaired people","volume":"35","author":"rodr\u00edguez","year":"2012","journal-title":"Proc IEEE Intell Veh Symp Workshops"},{"key":"ref159","article-title":"Kintinuous: Spatially extended KinectFusion","author":"whelan","year":"2012","journal-title":"RSS Workshop on RGB-D Advanced Reasoning with Depth Cameras"},{"key":"ref7","year":"2017","journal-title":"Who Vision Impairment and Blindness"},{"key":"ref49","author":"qi","year":"2016","journal-title":"Pointnet Deep learning on point sets for 3d classification and segmentation"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298930"},{"key":"ref9","author":"song","year":"2016","journal-title":"Semantic scene completion from a single depth image"},{"key":"ref158","author":"doumanoglou","year":"2016","journal-title":"Siamese Regression Networks with Efficient Mid-level Feature Extraction for 3D Object Pose Estimation"},{"key":"ref46","author":"badrinarayanan","year":"2015","journal-title":"Segnet A Deep Convolutional Encoder-Decoder Architecture for Image Segmentation"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.179"},{"key":"ref47","first-page":"656","article-title":"Convolutional-recursive deep learning for 3D object classification","author":"socher","year":"2012","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2011.6130298"},{"key":"ref41","first-page":"51","article-title":"The Stixel world&#x2014;A compact medium level representation of the 3d-world","author":"badino","year":"2009","journal-title":"Proc DAGM-Symp"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref43","year":"2018","journal-title":"Planner 5D"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.234"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913514283"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2015.7139369"},{"key":"ref70","first-page":"801","article-title":"Efficient sparse coding algorithms","author":"lee","year":"2007","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298639"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.586"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1023\/A:1010933404324"},{"key":"ref75","first-page":"462","article-title":"Latent-class Hough forests for 3D object detection and pose estimation","author":"tejani","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref78","first-page":"520","article-title":"Robust instance recognition in presence of occlusion and clutter","author":"bonde","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/5254.708428"},{"key":"ref60","author":"kendall","year":"2015","journal-title":"Bayesian segnet Model uncertainty in deep convolutional encoder-decoder architectures for scene understanding"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2016.7759429"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.701"},{"key":"ref63","author":"dai","year":"2016","journal-title":"Shape completion using 3d-encoder-predictor cnns and shape synthesis"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.179"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299022"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299091"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.180"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2501811"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2430335"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298938"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2014.7025222"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2015.7301391"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2015.2506118"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0822-0"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.165"},{"key":"ref196","first-page":"110","article-title":"Saliency detection via cellular automata","author":"qin","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref95","author":"hegde","year":"2016","journal-title":"Fusionnet 3d object classification using multiple data representations"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2015.2480802"},{"key":"ref190","author":"brock","year":"2016","journal-title":"Generative and discriminative voxel modeling with convolutional neural networks"},{"key":"ref93","author":"chatfield","year":"2014","journal-title":"Return of the devil in the details Delving deep into convolutional nets"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.495"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1145\/360825.360839"},{"key":"ref192","author":"qi","year":"2017","journal-title":"Frustum pointnets for 3d object detection from rgb-d data"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2015.7353481"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1162\/neco.2006.18.7.1527"},{"key":"ref98","first-page":"2672","article-title":"Generative adversarial nets","author":"goodfellow","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10578-9_38"},{"key":"ref96","author":"qi","year":"2017","journal-title":"Pointnet++ Deep hierarchical feature learning on point sets in a metric space"},{"key":"ref97","author":"zeng","year":"2017","journal-title":"3DContextNet K-d tree guided hierarchical learning of point clouds using local and global contextual cues"},{"key":"ref82","first-page":"82","article-title":"Learning a probabilistic latent space of object shapes via 3D generative-adversarial modeling","author":"wu","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.79"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2010.161"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.609"},{"key":"ref80","first-page":"85","article-title":"Support vector machines for classification and regression","volume":"14","author":"gunn","year":"1998"},{"key":"ref89","first-page":"1912","article-title":"3D ShapeNets: A deep representation for volumetric shapes","author":"wu","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.68"},{"key":"ref86","first-page":"215","article-title":"An analysis of single-layer networks in unsupervised feature learning","author":"coates","year":"2011","journal-title":"Proc 14th Int Conf Artif Intell Statist"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2015.7353446"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.645"},{"key":"ref200","year":"2017","journal-title":"Accelerating AI With GPUS"},{"key":"ref101","first-page":"424","article-title":"3D object proposals for accurate object class detection","author":"chen","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref100","first-page":"634","article-title":"Sliding shapes for 3D object detection in depth images","author":"song","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.401"},{"key":"ref203","author":"frosst","year":"2017","journal-title":"Distilling a neural network into a soft decision tree"},{"key":"ref204","author":"yuan","year":"2017","journal-title":"Adversarial examples Attacks and defenses for deep learning"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.292"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2016.XII.041"},{"key":"ref207","year":"2018","journal-title":"Oms2world"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.97"},{"key":"ref205","year":"0","journal-title":"Datasets"},{"key":"ref206","author":"hackel","year":"2017","journal-title":"Semantic3d net A new large-scale point cloud classification benchmark"},{"key":"ref211","first-page":"756","article-title":"Efficient joint segmentation, occlusion labeling, stereo and flow estimation","author":"yamaguchi","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref210","first-page":"3013","article-title":"Towards 3D object detection with bimodal deep Boltzmann machines over RGBD imagery","author":"liu","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2014.6907430"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"ref214","year":"2017","journal-title":"Who Number of Road Traffic Deaths"},{"key":"ref215","first-page":"580","article-title":"A semantics-based decision theory region analyser","author":"yakimovsky","year":"1973","journal-title":"Proc IJCAI"},{"key":"ref216","volume":"4","author":"ohta","year":"1985","journal-title":"Knowledge-Based Interpretation of Outdoor Natural Color Scenes"},{"key":"ref217","first-page":"752","article-title":"An analysis system for scenes containing objects with substructures","author":"ohta","year":"1978","journal-title":"Proc 4th Int Joint Conf Pattern Recognit"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.1983.4767366"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1016\/0146-664X(81)90019-8"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.1981.4767147"},{"key":"ref221","article-title":"VISIONS: A computer system for interpreting scenes","author":"hanson","year":"1978","journal-title":"Computer Vision Systems"},{"key":"ref127","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref125","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2505283"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.164"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2572683"},{"key":"ref133","author":"gal","year":"2015","journal-title":"Bayesian convolutional neural networks with bernoulli approximate variational inference"},{"key":"ref134","first-page":"1929","article-title":"Dropout: A simple way to prevent neural networks from overfitting","volume":"15","author":"srivastava","year":"2014","journal-title":"J Mach Learn Res"},{"key":"ref131","first-page":"1050","article-title":"Dropout as a Bayesian approximation: Representing model uncertainty in deep learning","author":"gal","year":"2016","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref132","author":"yu","year":"2015","journal-title":"Multi-scale context aggregation by dilated convolutions"},{"key":"ref136","first-page":"3521","article-title":"Newtonian scene understanding: Unfolding the dynamics of objects in static images","author":"mottaghi","year":"2016","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref135","author":"tchapmi","year":"2017","journal-title":"Segcloud Semantic segmentation of 3d point clouds"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.402"},{"key":"ref137","first-page":"127","article-title":"Galileo: Perceiving physical object properties by integrating a physics engine with deep learning","author":"wu","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2005.161"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2359435"},{"key":"ref141","doi-asserted-by":"crossref","first-page":"18327","DOI":"10.1073\/pnas.1306572110","article-title":"Simulation as an engine of physical scene understanding","volume":"110","author":"battaglia","year":"2013","journal-title":"Proc Nat Acad Sci USA"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.8"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.664"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/s10846-011-9608-y"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995333"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2014.6907351"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.169"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126229"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/ACSSC.1993.342465"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2006.881199"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0780-y"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.282"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.558"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.50"},{"key":"ref111","author":"sedaghat","year":"2016","journal-title":"Orientation-boosted voxel nets for 3d object recognition"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.49"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2013.6630857"},{"key":"ref10","first-page":"746","article-title":"Indoor segmentation and support inference from RGBD images","author":"silberman","year":"2012","journal-title":"Vision Computer"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.1999.790410"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref13","first-page":"404","article-title":"SURF: Speeded up robust features","author":"bay","year":"2006","journal-title":"Vision Computer"},{"key":"ref14","first-page":"589","article-title":"Region covariance: A fast descriptor for detection and classification","author":"tuzel","year":"2006","journal-title":"Proc Eur Conf Comput Vis (ECCV)"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2006.244"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1001\/archneur.1968.00480030127018"},{"key":"ref118","first-page":"1","article-title":"Toward real-time indoor semantic segmentation using depth information","volume":"1","author":"couprie","year":"2014","journal-title":"J Mach Learn Res"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.4324\/9781315009292"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.202"},{"key":"ref18","volume":"2","author":"barrow","year":"1978","journal-title":"Computer Vision Systems"},{"key":"ref19","author":"marr","year":"1982","journal-title":"Vision A Computational Investigation into the Human Representation and Processing of Visual Information"},{"key":"ref119","author":"kalogerakis","year":"2016","journal-title":"3d shape segmentation with projective convolutional networks"},{"key":"ref114","first-page":"1","article-title":"Semantic parsing for priming object detection in RGB-D scenes","author":"cadena","year":"2013","journal-title":"3rd Workshop on Semantic Perception Mapping and Exploration (SPME)"},{"key":"ref113","doi-asserted-by":"crossref","first-page":"238","DOI":"10.1145\/2980179.2980224","article-title":"3D attention-driven depth acquisition for object identification","volume":"35","author":"xu","year":"2016","journal-title":"ACM Trans Graph"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2014.6907236"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2014.6907778"},{"key":"ref120","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1007\/978-3-319-59126-1_9","article-title":"Multimodal neural networks: RGB-D for semantic segmentation and object detection","author":"schneider","year":"2017","journal-title":"Proc Scand Conf Image Anal"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"ref122","first-page":"213","article-title":"FuseNet: Incorporating depth into semantic segmentation via fusion-based CNN architecture","author":"hazirbas","year":"2016","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2013.6738875"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/8600701\/08573760.pdf?arnumber=8573760","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,25]],"date-time":"2022-01-25T23:56:52Z","timestamp":1643155012000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8573760\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"references-count":221,"URL":"https:\/\/doi.org\/10.1109\/access.2018.2886133","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019]]}}}