{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,30]],"date-time":"2026-06-30T15:35:36Z","timestamp":1782833736141,"version":"3.54.5"},"publisher-location":"Cham","reference-count":57,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012519","type":"print"},{"value":"9783030012526","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01252-6_2","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T13:48:05Z","timestamp":1538747285000},"page":"20-37","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":57,"title":["Self-Supervised Relative Depth Learning for Urban Scene Understanding"],"prefix":"10.1007","author":[{"given":"Huaizu","family":"Jiang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gustav","family":"Larsson","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Michael","family":"Maire","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Greg","family":"Shakhnarovich","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Erik","family":"Learned-Miller","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"2_CR1","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. CoRR abs\/1409.1556 (2014)"},{"key":"2_CR2","doi-asserted-by":"publisher","first-page":"640","DOI":"10.1109\/TPAMI.2016.2572683","volume":"39","author":"E Shelhamer","year":"2017","unstructured":"Shelhamer, E., Long, J., Darrell, T.: Fully convolutional networks for semantic segmentation. TPAMI 39, 640\u2013651 (2017)","journal-title":"TPAMI"},{"key":"2_CR3","doi-asserted-by":"crossref","unstructured":"Larsson, G., Maire, M., Shakhnarovich, G.: Colorization as a proxy task for visual understanding. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.96"},{"key":"2_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"649","DOI":"10.1007\/978-3-319-46487-9_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Zhang","year":"2016","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Colorful image colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 649\u2013666. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_40"},{"key":"2_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46466-4_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"M Noroozi","year":"2016","unstructured":"Noroozi, M., Favaro, P.: Unsupervised learning of visual representations by solving jigsaw puzzles. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 69\u201384. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_5"},{"key":"2_CR6","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.320"},{"key":"2_CR7","doi-asserted-by":"crossref","unstructured":"Agrawal, P., Carreira, J., Malik, J.: Learning to see by moving. In: CVPR (2015)","DOI":"10.1109\/ICCV.2015.13"},{"key":"2_CR8","doi-asserted-by":"crossref","unstructured":"Pathak, D., Girshick, R.B., Doll\u00e1r, P., Darrell, T., Hariharan, B.: Learning features by watching objects move. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.638"},{"key":"2_CR9","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Unsupervised learning using sequential verification for action recognition. In: ECCV (2016)"},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Pathak, D., Kr\u00e4henb\u00fchl, P., Donahue, J., Darrell, T., Efros, A.: Context encoders: feature learning by inpainting. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.278"},{"key":"2_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"248","DOI":"10.1007\/978-3-319-54193-8_16","volume-title":"Computer Vision \u2013 ACCV 2016","author":"R Gao","year":"2017","unstructured":"Gao, R., Jayaraman, D., Grauman, K.: Object-centric representation learning from unlabeled videos. In: Lai, S.-H., Lepetit, V., Nishino, K., Sato, Y. (eds.) ACCV 2016. LNCS, vol. 10115, pp. 248\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54193-8_16"},{"key":"2_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1007\/978-3-319-46448-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Owens","year":"2016","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Ambient sound provides supervision for visual learning. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 801\u2013816. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_48"},{"key":"2_CR13","doi-asserted-by":"crossref","unstructured":"Lee, H., Huang, J., Singh, M., Yang, M.: Unsupervised representation learning by sorting sequences. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.79"},{"key":"2_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"433","DOI":"10.1007\/978-3-319-46484-8_26","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Bideau","year":"2016","unstructured":"Bideau, P., Learned-Miller, E.: It\u2019s moving! a probabilistic model for causal motion segmentation in moving camera videos. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 433\u2013449. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_26"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Jayaraman, D., Grauman, K.: Learning image representations tied to ego-motion. In: ICCV, pp. 1413\u20131421 (2015)","DOI":"10.1109\/ICCV.2015.166"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? the KITTI vision benchmark suite. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"issue":"11","key":"2_CR17","doi-asserted-by":"publisher","first-page":"1231","DOI":"10.1177\/0278364913491297","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: Vision meets robotics: the KITTI dataset. Int. J. Robot. Res. (IJRR) 32, 1231\u20131237 (2013)","journal-title":"The International Journal of Robotics Research"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"577","DOI":"10.1007\/978-3-319-46493-0_35","volume-title":"Computer Vision \u2013 ECCV 2016","author":"G Larsson","year":"2016","unstructured":"Larsson, G., Maire, M., Shakhnarovich, G.: Learning representations for automatic colorization. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 577\u2013593. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_35"},{"key":"2_CR22","unstructured":"Ranzato, M., Szlam, A., Bruna, J., Mathieu, M., Collobert, R., Chopra, S.: Video (language) modeling: a baseline for generative models of natural videos. arXiv preprint arXiv:1412.6604 (2014)"},{"key":"2_CR23","unstructured":"Srivastava, N., Mansimov, E., Salakhutdinov, R.: Unsupervised learning of video representations using LSTMs. In: ICML (2015)"},{"key":"2_CR24","unstructured":"Mathieu, M., Couprie, C., LeCun, Y.: Deep multi-scale video prediction beyond mean square error. arXiv preprint arXiv:1511.05440 (2015)"},{"key":"2_CR25","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: NIPS (2016)"},{"key":"2_CR26","unstructured":"Xue, T., Wu, J., Bouman, K., Freeman, B.: Visual dynamics: probabilistic future frame synthesis via cross convolutional networks. In: NIPS (2016)"},{"key":"2_CR27","unstructured":"Lotter, W., Kreiman, G., Cox, D.: Deep predictive coding networks for video prediction and unsupervised learning. In: ICLR (2017)"},{"key":"2_CR28","unstructured":"Radford, A., Metz, L., Chintala, S.: Unsupervised representation learning with deep convolutional generative adversarial networks (2016)"},{"key":"2_CR29","unstructured":"Springenberg, J.T.: Unsupervised and semi-supervised learning with categorical generative adversarial networks. In: ICLR (2016)"},{"key":"2_CR30","unstructured":"Donahue, J., Kr\u00e4henb\u00fchl, P., Darrell, T.: Adversarial feature learning. In: ICLR (2017)"},{"key":"2_CR31","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"2_CR32","doi-asserted-by":"crossref","unstructured":"Mobahi, H., Collobert, R., Weston, J.: Deep learning from temporal coherence in video. In: ICML (2009)","DOI":"10.1145\/1553374.1553469"},{"key":"2_CR33","unstructured":"Isola, P., Zoran, D., Krishnan, D., Adelson, E.H.: Learning visual groups from co-occurrences in space and time. arXiv preprint arXiv:1511.06811 (2015)"},{"key":"2_CR34","doi-asserted-by":"crossref","unstructured":"Jayaraman, D., Grauman, K.: Slow and steady feature analysis: higher order temporal coherence in video. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.418"},{"issue":"4","key":"2_CR35","doi-asserted-by":"publisher","first-page":"715","DOI":"10.1162\/089976602317318938","volume":"14","author":"L Wiskott","year":"2002","unstructured":"Wiskott, L., Sejnowski, T.J.: Slow feature analysis: unsupervised learning of invariances. Neural Comput. 14(4), 715\u2013770 (2002)","journal-title":"Neural Comput."},{"key":"2_CR36","doi-asserted-by":"crossref","unstructured":"Doersch, C., Zisserman, A.: Multi-task self-supervised visual learning. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.226"},{"key":"2_CR37","doi-asserted-by":"crossref","unstructured":"Wang, X., He, K., Gupta, A.: Transitive invariance for self-supervised visual representation learning. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.149"},{"key":"2_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-46484-8_45","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Garg","year":"2016","unstructured":"Garg, R., B.G., V.K., Carneiro, G., Reid, I.: Unsupervised CNN for single view depth estimation: geometry to the rescue. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 740\u2013756. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_45"},{"key":"2_CR39","doi-asserted-by":"crossref","unstructured":"Godard, C., Mac Aodha, O., Brostow, G.J.: Unsupervised monocular depth estimation with left-right consistency. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.699"},{"key":"2_CR40","doi-asserted-by":"crossref","unstructured":"Zhou, T., Brown, M., Snavely, N., Lowe, D.G.: Unsupervised learning of depth and ego-motion from video. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.700"},{"key":"2_CR41","unstructured":"Vijayanarasimhan, S., Ricco, S., Schmid, C., Sukthankar, R., Fragkiadaki, K.: Sfm-Net: learning of structure and motion from video. arXiv (2017)"},{"key":"2_CR42","doi-asserted-by":"crossref","unstructured":"Li, Y., Paluri, M., Rehg, J.M., Doll\u00e1r, P.: Unsupervised learning of edges. In: CVPR, pp. 1619\u20131627 (2016)","DOI":"10.1109\/CVPR.2016.179"},{"key":"2_CR43","volume-title":"Robot Vision","author":"BKP Horn","year":"1986","unstructured":"Horn, B.K.P.: Robot Vision. MIT Press, Cambridge (1986)"},{"key":"2_CR44","doi-asserted-by":"crossref","unstructured":"Hu, Y., Li, Y., Song, R.: Robust interpolation of correspondences for large displacement optical flow. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.509"},{"issue":"8","key":"2_CR45","doi-asserted-by":"publisher","first-page":"1558","DOI":"10.1109\/TPAMI.2014.2377715","volume":"37","author":"P Doll\u00e1r","year":"2015","unstructured":"Doll\u00e1r, P., Zitnick, C.L.: Fast edge detection using structured forests. IEEE Trans. Patt. Anal. Mach. Intell. 37(8), 1558\u20131570 (2015)","journal-title":"IEEE Trans. Patt. Anal. Mach. Intell."},{"key":"2_CR46","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: NIPS, pp. 1106\u20131114 (2012)"},{"key":"2_CR47","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"2_CR48","unstructured":"Teichmann, M., Weber, M., Z\u00f6llner, J.M., Cipolla, R., Urtasun, R.: MultiNet: real-time joint semantic reasoning for autonomous driving. In: CVPR (2016)"},{"key":"2_CR49","doi-asserted-by":"crossref","unstructured":"Ros, G., Ramos, S., Granados, M., Bakhtiary, A., V\u00e1zquez, D., L\u00f3pez, A.M.: Vision-based offline-online perception paradigm for autonomous driving. In: WACV (2015)","DOI":"10.1109\/WACV.2015.38"},{"key":"2_CR50","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1007\/978-3-540-88682-2_5","volume-title":"Computer Vision \u2013 ECCV 2008","author":"GJ Brostow","year":"2008","unstructured":"Brostow, G.J., Shotton, J., Fauqueur, J., Cipolla, R.: Segmentation and recognition using structure from motion point clouds. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008. LNCS, vol. 5302, pp. 44\u201357. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-88682-2_5"},{"issue":"2","key":"2_CR51","doi-asserted-by":"publisher","first-page":"88","DOI":"10.1016\/j.patrec.2008.04.005","volume":"30","author":"Gabriel J. Brostow","year":"2009","unstructured":"Brostow, G.J., Fauqueur, J., Cipolla, R.: Semantic object classes in video: a high-definition ground truth database. Patt. Recogn. Lett. 30, 88\u201397 (2008)","journal-title":"Pattern Recognition Letters"},{"key":"2_CR52","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A.: Split-brain autoencoders: unsupervised learning by cross-channel prediction. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.76"},{"key":"2_CR53","doi-asserted-by":"crossref","unstructured":"Kundu, A., Vineet, V., Koltun, V.: Feature space optimization for semantic video segmentation. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.345"},{"key":"2_CR54","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Lecture Notes in Computer Science","author":"Olaf Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W., Frangi, A. (eds.) Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015. MICCAI 2015. Lecture Notes in Computer Science, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"2_CR55","unstructured":"Eigen, D., Puhrsch, C., Fergus, R.: Depth map prediction from a single image using a multi-scale deep network. In: NIPS (2014)"},{"issue":"10","key":"2_CR56","doi-asserted-by":"publisher","first-page":"2024","DOI":"10.1109\/TPAMI.2015.2505283","volume":"38","author":"F Liu","year":"2016","unstructured":"Liu, F., Shen, C., Lin, G., Reid, I.D.: Learning depth from single monocular images using deep convolutional neural fields. IEEE TPAMI 38(10), 2024\u20132039 (2016)","journal-title":"IEEE TPAMI"},{"key":"2_CR57","doi-asserted-by":"crossref","unstructured":"Kuznietsov, Y., St\u00fcckler, J., Leibe, B.: Semi-supervised deep learning for monocular depth map prediction. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.238"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01252-6_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,5]],"date-time":"2022-10-05T01:02:11Z","timestamp":1664931731000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01252-6_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012519","9783030012526"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01252-6_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}