{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T15:25:44Z","timestamp":1776093944558,"version":"3.50.1"},"reference-count":88,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2019,7,8]],"date-time":"2019-07-08T00:00:00Z","timestamp":1562544000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,7,8]],"date-time":"2019-07-08T00:00:00Z","timestamp":1562544000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1007\/s11263-019-01188-y","type":"journal-article","created":{"date-parts":[[2019,7,8]],"date-time":"2019-07-08T12:03:11Z","timestamp":1562587391000},"page":"1239-1285","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":208,"title":["Self-Supervised Model Adaptation for Multimodal Semantic Segmentation"],"prefix":"10.1007","volume":"128","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4710-3114","authenticated-orcid":false,"given":"Abhinav","family":"Valada","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5067-4279","authenticated-orcid":false,"given":"Rohit","family":"Mohan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5680-6500","authenticated-orcid":false,"given":"Wolfram","family":"Burgard","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,7,8]]},"reference":[{"key":"1188_CR1","unstructured":"Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., et al. (2015). TensorFlow: Large-scale machine learning on heterogeneous systems. Software available from tensorflow.org."},{"issue":"3","key":"1188_CR2","first-page":"32","volume":"13","author":"S Anwar","year":"2017","unstructured":"Anwar, S., Hwang, K., & Sung, W. (2017). Structured pruning of deep convolutional neural networks. ACM Journal on Emerging Technologies in Computing Systems (JETC), 13(3), 32.","journal-title":"ACM Journal on Emerging Technologies in Computing Systems (JETC)"},{"key":"1188_CR3","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1016\/j.isprsjprs.2017.11.011","volume":"140","author":"N Audebert","year":"2018","unstructured":"Audebert, N., Le Saux, B., & Lef\u00e8vre, S. (2018). Beyond rgb: Very high resolution urban remote sensing with multimodal deep networks. ISPRS Journal of Photogrammetry and Remote Sensing, 140, 20\u201332.","journal-title":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"key":"1188_CR4","unstructured":"Badrinarayanan, V., Kendall, A., & Cipolla, R. (2015). Segnet: A deep convolutional encoder-decoder architecture for image segmentation. arXiv preprint \narXiv:1511.00561\n\n."},{"key":"1188_CR5","unstructured":"Boniardi, F., Valada, A., Mohan, R., Caselitz, T., & Burgard, W. (2019). Robot localization in floor plans using a room layout edge extraction network. arXiv preprint \narXiv:1903.01804\n\n."},{"key":"1188_CR6","doi-asserted-by":"crossref","unstructured":"Brostow, G. J., Shotton, J., Fauqueur, J., & Cipolla, R. (2008). Segmentation and recognition using structure from motion point clouds. In D. Forsyth, P. Torr, & A. Zisserman (Eds.), Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-540-88682-2_5"},{"key":"1188_CR7","doi-asserted-by":"crossref","unstructured":"Bul\u00f2, S. R., Porzi, L., & Kontschieder, P. (2018). In-place activated batchnorm for memory-optimized training of dnns. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2018.00591"},{"key":"1188_CR8","unstructured":"Chattopadhyay, A., Sarkar, A., Howlader, P., & Balasubramanian, V. N. (2017). Grad-cam++: Generalized gradient-based visual explanations for deep convolutional networks. arXiv preprint \narXiv:1710.11063\n\n."},{"key":"1188_CR9","unstructured":"Chen, L., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A. L. (2016). Deeplab: Semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. arxiv preprint \narXiv: 1606.00915\n\n."},{"key":"1188_CR10","unstructured":"Chen, L. C., Collins, M., Zhu, Y., Papandreou, G., Zoph, B., Schroff, F., Adam, H., & Shlens, J. (2018a). Searching for efficient multi-scale architectures for dense image prediction. In Advances in neural information processing systems (pp. 8713\u20138724)."},{"key":"1188_CR11","unstructured":"Chen, L. C., Papandreou, G., Schroff, F., & Adam, H. (2017). Rethinking atrous convolution for semantic image segmentation. arXiv preprint \narXiv:1706.05587\n\n."},{"key":"1188_CR12","doi-asserted-by":"crossref","unstructured":"Chen, L. C., Zhu, Y., Papandreou, G., Schroff, F., & Adam, H. (2018b). Encoder\u2013decoder with atrous separable convolution for semantic image segmentation. arXiv preprint \narXiv:1802.02611\n\n.","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"1188_CR13","unstructured":"Chollet, F. (2016). Xception: Deep learning with depthwise separable convolutions. arXiv preprint \narXiv:1610.02357\n\n."},{"issue":"8","key":"1188_CR14","doi-asserted-by":"publisher","first-page":"3563","DOI":"10.1093\/cercor\/bhw135","volume":"26","author":"RM Cichy","year":"2016","unstructured":"Cichy, R. M., Pantazis, D., & Oliva, A. (2016). Similarity-based fusion of meg and fmri reveals spatio-temporal dynamics in human cortex during visual object recognition. Cerebral Cortex, 26(8), 3563\u20133579.","journal-title":"Cerebral Cortex"},{"key":"1188_CR15","doi-asserted-by":"crossref","unstructured":"Cordts, M., Omran, M., Ramos, S., Rehfeld, T., Enzweiler, M., Benenson, R., et al. (2016). The cityscapes dataset for semantic urban scene understanding. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2016.350"},{"key":"1188_CR16","unstructured":"Couprie, C., Farabet, C., Najman, L., & LeCun, Y. (2013). Indoor semantic segmentation using depth information. arXiv preprint \narXiv:1301.3572\n\n."},{"key":"1188_CR17","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A. X., Savva, M., Halber, M., Funkhouser, T., & Nie\u00dfner, M. (2017). Scannet: Richly-annotated 3d reconstructions of indoor scenes. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.261"},{"key":"1188_CR18","unstructured":"Dai, A., & Nie\u00dfner, M. (2018). 3dmv: Joint 3d-multi-view prediction for 3d semantic scene segmentation. arXiv preprint \narXiv:1803.10409\n\n."},{"key":"1188_CR19","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L. J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1188_CR20","doi-asserted-by":"crossref","unstructured":"Eitel, A., Springenberg, J. T., Spinello, L., Riedmiller, M. A., & Burgard, W. (2015). Multimodal deep learning for robust rgb-d object recognition. In Proceedings of the IEEE\/RSJ international conference on intelligent robots and systems.","DOI":"10.1109\/IROS.2015.7353446"},{"issue":"1","key":"1188_CR21","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/s11263-014-0733-5","volume":"111","author":"M Everingham","year":"2015","unstructured":"Everingham, M., Eslami, S. A., Van Gool, L., Williams, C. K., Winn, J., & Zisserman, A. (2015). The pascal visual object classes challenge: A retrospective. International Journal of Computer Vision, 111(1), 98\u2013136.","journal-title":"International Journal of Computer Vision"},{"key":"1188_CR22","unstructured":"Farabet, C., Couprie, C., Najman, L., & LeCun, Y. (2012). Scene parsing with multiscale feature learning, purity trees, and optimal covers. In Proceedings of the international conference on machine learning."},{"issue":"8","key":"1188_CR23","doi-asserted-by":"publisher","first-page":"863","DOI":"10.1167\/4.8.863","volume":"4","author":"L Fei-Fei","year":"2004","unstructured":"Fei-Fei, L., Koch, C., Iyer, A., & Perona, P. (2004). What do we see when we glance at a scene? Journal of Vision, 4(8), 863\u2013863.","journal-title":"Journal of Vision"},{"key":"1188_CR24","doi-asserted-by":"crossref","unstructured":"Fulkerson, B., Vedaldi, A., & Soatto, S. (2009). Class segmentation and object localization with superpixel neighborhoods. In Proceedings of the international conference on computer vision.","DOI":"10.1109\/ICCV.2009.5459175"},{"key":"1188_CR25","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., & Fowlkes, C. C. (2016). Laplacian pyramid reconstruction and refinement for semantic segmentation. In European conference on computer vision (pp. 519\u2013534).","DOI":"10.1007\/978-3-319-46487-9_32"},{"key":"1188_CR26","unstructured":"Grangier, D., Bottou, L., & Collobert, R. (2009). Deep convolutional networks for scene parsing. In ICML workshop on deep learning."},{"key":"1188_CR27","doi-asserted-by":"crossref","unstructured":"Gupta, S., Girshick, R., Arbel\u00e1ez, P., & Malik, J. (2014). Learning rich features from rgb-d images for object detection and segmentation. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-319-10584-0_23"},{"key":"1188_CR28","unstructured":"Hazirbas, C., Ma, L., Domokos, C., & Cremers, D. (2016). Fusenet: Incorporating depth into semantic segmentation via fusion-based cnn architecture. In Proceedings of the Asian conference on computer vision."},{"key":"1188_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2015a). Deep residual learning for image recognition. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2016.90"},{"key":"1188_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2015b). Delving deep into rectifiers: Surpassing human-level performance on imagenet classification. In Proceedings of the international conference on computer vision.","DOI":"10.1109\/ICCV.2015.123"},{"issue":"9","key":"1188_CR31","doi-asserted-by":"publisher","first-page":"1904","DOI":"10.1109\/TPAMI.2015.2389824","volume":"37","author":"K He","year":"2015","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2015c). Spatial pyramid pooling in deep convolutional networks for visual recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, 37(9), 1904\u20131916.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1188_CR32","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J. (2016). Identity mappings in deep residual networks. In Proceedings of the European conference on computer vision (pp. 630\u2013645).","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"1188_CR33","doi-asserted-by":"crossref","unstructured":"Hermans, A., Floros, G., & Leibe, B. (2014). Dense 3d semantic mapping of indoor scenes from rgb-d images. In Proceedings of the IEEE international conference on robotics and automation.","DOI":"10.1109\/ICRA.2014.6907236"},{"key":"1188_CR34","unstructured":"Hu, J., Shen, L., & Sun, G. (2017). Squeeze-and-excitation networks. arXiv preprint \narXiv:1709.01507\n\n."},{"key":"1188_CR35","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., & Sun, G. (2018). Squeeze-and-excitation networks. In Proceedings of the conference on computer vision and pattern recognition (pp. 7132\u20137141).","DOI":"10.1109\/CVPR.2018.00745"},{"key":"1188_CR36","first-page":"213","volume":"3","author":"A Huete","year":"1999","unstructured":"Huete, A., Justice, C., & Van Leeuwen, W. (1999). Modis vegetation index (mod13). Algorithm Theoretical Basis Document, 3, 213.","journal-title":"Algorithm Theoretical Basis Document"},{"key":"1188_CR37","doi-asserted-by":"crossref","unstructured":"Janoch, A., Karayev, S., Jia, Y., Barron, J. T., Fritz, M., Saenko, K., et al. (2013). A category-level 3d object dataset: Putting the kinect to work. In Proceedings of the IEEE international conference on consumer depth cameras for computer vision (pp. 141\u2013165).","DOI":"10.1007\/978-1-4471-4640-7_8"},{"key":"1188_CR38","unstructured":"Kim, D. K., Maturana, D., Uenoyama, M., & Scherer, S. (2017). Season-invariant semantic segmentation with a deep multimodal network. In Field and service robotics."},{"issue":"3","key":"1188_CR39","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-008-0202-0","volume":"82","author":"P Kohli","year":"2009","unstructured":"Kohli, P., Torr, P. H., et al. (2009). Robust higher order potentials for enforcing label consistency. International Journal of Computer Vision, 82(3), 302\u2013324.","journal-title":"International Journal of Computer Vision"},{"key":"1188_CR40","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems (pp. 1097\u20131105)."},{"key":"1188_CR41","unstructured":"Ku, J., Harakeh, A., & Waslander, S. L. (2018). In defense of classical image processing: Fast depth completion on the cpu. arXiv preprint \narXiv:1802.00036\n\n."},{"key":"1188_CR42","unstructured":"LeCun, Y., Denker, J. S., & Solla, S. A. (1990). Optimal brain damage. In Advances in neural information processing systems (pp. 598\u2013605)."},{"key":"1188_CR43","unstructured":"Lee, C. Y., Xie, S., Gallagher, P., Zhang, Z., & Tu, Z. (2015). Deeply-supervised nets. In Artificial intelligence and statistics (pp. 562\u2013570)."},{"key":"1188_CR44","unstructured":"Li, H., Kadav, A., Durdanovic, I., Samet, H., & Graf, H. P. (2016). Pruning filters for efficient convnets. arXiv preprint \narXiv:1608.08710"},{"key":"1188_CR45","doi-asserted-by":"crossref","unstructured":"Li, Z., Gan, Y., Liang, X., Yu, Y., Cheng, H., & Lin, L. (2016). Lstm-cf: Unifying context modeling and fusion with lstms for rgb-d scene labeling. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-319-46475-6_34"},{"key":"1188_CR46","unstructured":"Liang-Chieh, C., Papandreou, G., Kokkinos, I., Murphy, K., & Yuille, A. (2015). Semantic image segmentation with deep convolutional nets and fully connected crfs. In International conference on learning representations."},{"key":"1188_CR47","doi-asserted-by":"crossref","unstructured":"Lin, G., Milan, A., Shen, C., & Reid, I. D. (2017). Refinenet: Multi-path refinement networks for high-resolution semantic segmentation. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.549"},{"key":"1188_CR48","unstructured":"Lin, M., Chen, Q., & Yan, S. (2013). Network in network. arXiv preprint \narXiv:1312.4400\n\n."},{"key":"1188_CR49","unstructured":"Liu, W., Rabinovich, A., & Berg, A. C. (2015). Parsenet: Looking wider to see better. arXiv preprint \narXiv:1506.04579\n\n."},{"key":"1188_CR50","doi-asserted-by":"crossref","unstructured":"Liu, Z., Li, J., Shen, Z., Huang, G., Yan, S., & Zhang, C. (2017). Learning efficient convolutional networks through network slimming. In Proceedings of the international conference on computer vision.","DOI":"10.1109\/ICCV.2017.298"},{"key":"1188_CR51","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., & Darrell, T. (2015). Fully convolutional networks for semantic segmentation. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"1188_CR52","unstructured":"Molchanov, P., Tyree, S., Karras, T., Aila, T., & Kautz, J. (2017). Pruning convolutional neural networks for resource efficient inference. Proceedings of the international conference on learning representation."},{"key":"1188_CR53","doi-asserted-by":"crossref","unstructured":"Munoz, D., Bagnell, J. A., & Hebert, M. (2012). Co-inference for multi-modal scene analysis. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-642-33783-3_48"},{"key":"1188_CR54","unstructured":"Noh, H., Araujo, A., Sim, J., Weyand, T., & Han, B. (2017). Largescale image retrieval with attentive deep local features. In Proceedings of the IEEE International conference on computer vision (pp. 3456\u20133465)."},{"key":"1188_CR55","doi-asserted-by":"crossref","unstructured":"Noh, H., Hong, S., & Han, B. (2015). Learning deconvolution network for semantic segmentation. In Proceedings of the international conference on computer vision (pp. 1520\u20131528).","DOI":"10.1109\/ICCV.2015.178"},{"key":"1188_CR56","doi-asserted-by":"crossref","unstructured":"Oliveira, G., Valada, A., Bollen, C., Burgard, W., & Brox, T. (2016). Deep learning for human part discovery in images. In Proceedings of the IEEE international conference on robotics and automation.","DOI":"10.1109\/ICRA.2016.7487304"},{"key":"1188_CR57","unstructured":"Paszke, A., Chaurasia, A., Kim, S., & Culurciello, E. (2016). Enet: A deep neural network architecture for real-time semantic segmentation. arXiv preprint \narXiv:1606.02147\n\n."},{"key":"1188_CR58","unstructured":"Pinheiro, P. O., & Collobert, R. (2014). Recurrent convolutional neural networks for scene labeling. In Proceedings of the international conference on machine learning."},{"key":"1188_CR59","doi-asserted-by":"crossref","unstructured":"Plath, N., Toussaint, M., & Nakajima, S. (2009). Multi-class image segmentation using conditional random fields and global classification. In Proceedings of the international conference on machine learning.","DOI":"10.1145\/1553374.1553479"},{"key":"1188_CR60","doi-asserted-by":"crossref","unstructured":"Qi, X., Liao, R., Jia, J., Fidler, S., & Urtasun, R. (2017). 3d graph neural networks for rgbd semantic segmentation. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5199\u20135208).","DOI":"10.1109\/ICCV.2017.556"},{"key":"1188_CR61","unstructured":"Radwan, N., Valada, A., & Burgard, W. (2018a). Multimodal interaction-aware motion prediction for autonomous street crossing. arXiv preprint \narXiv:1808.06887\n\n."},{"issue":"4","key":"1188_CR62","doi-asserted-by":"publisher","first-page":"4407","DOI":"10.1109\/LRA.2018.2869640","volume":"3","author":"N Radwan","year":"2018","unstructured":"Radwan, N., Valada, A., & Burgard, W. (2018b). Vlocnet++: Deep multitask learning for semantic visual localization and odometry. IEEE Robotics and Automation Letters (RA-L), 3(4), 4407\u20134414.","journal-title":"IEEE Robotics and Automation Letters (RA-L)"},{"key":"1188_CR63","unstructured":"Ren, X., Bo, L., & Fox, D. (2012). Rgb-(d) scene labeling: Features and algorithms. In Proceedings of the conference on computer vision and pattern recognition."},{"issue":"1","key":"1188_CR64","doi-asserted-by":"publisher","first-page":"263","DOI":"10.1109\/TITS.2017.2750080","volume":"19","author":"E Romera","year":"2018","unstructured":"Romera, E., Alvarez, J. M., Bergasa, L. M., & Arroyo, R. (2018). Erfnet: Efficient residual factorized convnet for real-time semantic segmentation. IEEE Transactions on Intelligent Transportation Systems, 19(1), 263\u2013272.","journal-title":"IEEE Transactions on Intelligent Transportation Systems"},{"key":"1188_CR65","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., & Brox, T. (2015). U-net: Convolutional networks for biomedical image segmentation. In International conference on medical image computing and computer-assisted intervention (pp. 234\u2013241).","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"1188_CR66","doi-asserted-by":"crossref","unstructured":"Ros, G., Sellart, L., Materzynska, J., Vazquez, D., & Lopez, A. M. (2016). The SYNTHIA dataset: A large collection of synthetic images for semantic segmentation of urban scenes. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2016.352"},{"key":"1188_CR67","unstructured":"Running, S. W., Nemani, R., Glassy, J. M., & Thornton, P. E. (1999). Modis daily photosynthesis (psn) and annual net primary production (npp) product (mod17) algorithm theoretical basis document. University of Montana, SCF At-Launch Algorithm ATBD Documents."},{"key":"1188_CR68","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., & Chen, L. C. (2018). Mobilenetv2: Inverted residuals and linear bottlenecks. In Proceedings of the conference on computer vision and pattern recognition (pp. 4510\u20134520).","DOI":"10.1109\/CVPR.2018.00474"},{"key":"1188_CR69","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/978-3-319-59126-1_9","volume-title":"Image analysis","author":"L Schneider","year":"2017","unstructured":"Schneider, L., Jasch, M., Fr\u00f6hlich, B., Weber, T., Franke, U., Pollefeys, M., et al. (2017). Multimodal neural networks: Rgb-d for semantic segmentation and object detection. In P. Sharma & F. M. Bianchi (Eds.), Image analysis (pp. 98\u2013109). Cham: Springer."},{"key":"1188_CR70","doi-asserted-by":"crossref","unstructured":"Shotton, J., Johnson, M., & Cipolla, R. (2008). Semantic texton forests for image categorization and segmentation. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2008.4587503"},{"key":"1188_CR71","doi-asserted-by":"crossref","unstructured":"Silberman, N., Hoiem, D., Kohli, P., & Fergus, R. (2012). Indoor segmentation and support inference from rgbd images. In Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"1188_CR72","unstructured":"Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint \narXiv:1409.1556\n\n."},{"key":"1188_CR73","doi-asserted-by":"crossref","unstructured":"Song, S., Lichtenberg, S. P., & Xiao, J. (2015). Sun rgb-d: A rgb-d scene understanding benchmark suite. In Proceedings of the conference on computer vision and pattern recognition (Vol.\u00a05, p.\u00a06).","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"1188_CR74","doi-asserted-by":"crossref","unstructured":"Sturgess, P., Alahari, K., Ladicky, L., & Torr, P. H. S. (2009). Combining appearance and structure from motion features for road scene understanding. In Proceedings of the British machine vision conference.","DOI":"10.5244\/C.23.62"},{"key":"1188_CR75","unstructured":"Valada, A., Dhall, A., & Burgard, W. (2016a). Convoluted mixture of deep experts for robust semantic segmentation. In IEEE\/RSJ International conference on intelligent robots and systems (IROS) workshop, state estimation and terrain perception for all terrain mobile robots."},{"key":"1188_CR76","doi-asserted-by":"crossref","unstructured":"Valada, A., Oliveira, G., Brox, T., & Burgard, W. (2016b). Deep multispectral semantic scene understanding of forested environments using multimodal fusion. In Proceedings of the international symposium for experimental robotics.","DOI":"10.1007\/978-3-319-50115-4_41"},{"key":"1188_CR77","unstructured":"Valada, A., Oliveira, G., Brox, T., & Burgard, W. (2016c). Towards robust semantic segmentation using deep fusion. In Robotics: Science and systems (RSS 2016) workshop, are the sceptics right? Limits and potentials of deep learning in robotics."},{"key":"1188_CR78","doi-asserted-by":"crossref","unstructured":"Valada, A., Vertens, J., Dhall, A., & Burgard, W. (2017). Adapnet: Adaptive semantic segmentation in adverse environmental conditions. In Proceedings of the IEEE international conference on robotics and automation.","DOI":"10.1109\/ICRA.2017.7989540"},{"key":"1188_CR79","unstructured":"Wen, W., Wu, C., Wang, Y., Chen, Y., & Li, H. (2016). Learning structured sparsity in deep neural networks. In Advances in neural information processing systems (pp. 2074\u20132082)."},{"key":"1188_CR80","unstructured":"Xiang, Y., & Fox, D. (2017). Da-rnn: Semantic mapping with data associated recurrent neural networks. arXiv preprint \narXiv:1703.03098\n\n."},{"key":"1188_CR81","doi-asserted-by":"crossref","unstructured":"Xiao, J., Owens, A., & Torralba, A. (2013). Sun3d: A database of big spaces reconstructed using sfm and object labels. In Proceedings of the international conference on computer vision.","DOI":"10.1109\/ICCV.2013.458"},{"key":"1188_CR82","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., & He, K. (2017). Aggregated residual transformations for deep neural networks. In Proceedings of the conference on computer vision and pattern recognition (pp. 5987\u20135995).","DOI":"10.1109\/CVPR.2017.634"},{"key":"1188_CR83","doi-asserted-by":"crossref","unstructured":"Yang, M., Yu, K., Zhang, C., Li, Z., & Yang, K. (2018). Denseaspp for semantic segmentation in street scenes. In Proceedings of the conference on computer vision and pattern recognition (pp. 3684\u20133692).","DOI":"10.1109\/CVPR.2018.00388"},{"key":"1188_CR84","unstructured":"Yu, F., & Koltun, V. (2016). Multi-scale context aggregation by dilated convolutions. In International conference on learning representations."},{"key":"1188_CR85","doi-asserted-by":"crossref","unstructured":"Zhang, C., Wang, L., & Yang, R. (2010). Semantic segmentation of urban scenes using dense depth maps. In K. Daniilidis, P. Maragos, & N. Paragios (Eds.), Proceedings of the European conference on computer vision.","DOI":"10.1007\/978-3-642-15561-1_51"},{"key":"1188_CR86","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., & Jia, J. (2017). Pyramid scene parsing network. In Proceedings of the conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.660"},{"key":"1188_CR87","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., & Torralba, A. (2014). Object detectors emerge in deep scene cnns. arXiv preprint \narXiv:1412.6856\n\n."},{"key":"1188_CR88","doi-asserted-by":"crossref","unstructured":"Zhuang, Y., Yang, F., Tao, L., Ma, C., Zhang, Z., Li, Y., et al. (2018). Dense relation network: Learning consistent and context-aware representation for semantic image segmentation. In 2018 25th IEEE international conference on image processing (ICIP) (pp. 3698\u20133702).","DOI":"10.1109\/ICIP.2018.8451830"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01188-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-019-01188-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01188-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,7,6]],"date-time":"2020-07-06T23:10:04Z","timestamp":1594077004000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-019-01188-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,7,8]]},"references-count":88,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2020,5]]}},"alternative-id":["1188"],"URL":"https:\/\/doi.org\/10.1007\/s11263-019-01188-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,7,8]]},"assertion":[{"value":"24 July 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 June 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 July 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}