{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T17:29:05Z","timestamp":1778606945693,"version":"3.51.4"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585679","type":"print"},{"value":"9783030585686","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58568-6_42","type":"book-chapter","created":{"date-parts":[[2020,11,12]],"date-time":"2020-11-12T14:03:09Z","timestamp":1605189789000},"page":"710-727","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":64,"title":["Learning Monocular Visual Odometry via Self-Supervised Long-Term Modeling"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8374-6662","authenticated-orcid":false,"given":"Yuliang","family":"Zou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6213-554X","authenticated-orcid":false,"given":"Pan","family":"Ji","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1396-6544","authenticated-orcid":false,"given":"Quoc-Huy","family":"Tran","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0536-3658","authenticated-orcid":false,"given":"Jia-Bin","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4683-2454","authenticated-orcid":false,"given":"Manmohan","family":"Chandraker","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,11,13]]},"reference":[{"key":"42_CR1","unstructured":"Bian, J.W., et al.: Unsupervised scale-consistent depth and ego-motion learning from monocular video. In: NeurIPS (2019)"},{"key":"42_CR2","doi-asserted-by":"crossref","unstructured":"Bloesch, M., Czarnowski, J., Clark, R., Leutenegger, S., Davison, A.J.: CodesLAM\u2013learning a compact, optimisable representation for dense visual slam. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00271"},{"key":"42_CR3","doi-asserted-by":"crossref","unstructured":"Bogdan, O., Eckstein, V., Rameau, F., Bazin, J.C.: DeepCalib: a deep learning approach for automatic intrinsic calibration of wide field-of-view cameras. In: CVMP (2018)","DOI":"10.1145\/3278471.3278479"},{"issue":"6","key":"42_CR4","doi-asserted-by":"publisher","first-page":"1309","DOI":"10.1109\/TRO.2016.2624754","volume":"32","author":"C Cadena","year":"2016","unstructured":"Cadena, C., et al.: Past, present, and future of simultaneous localization and mapping: toward the robust-perception age. IEEE Trans. Robot. 32(6), 1309\u20131332 (2016)","journal-title":"IEEE Trans. Robot."},{"key":"42_CR5","unstructured":"Chorowski, J., Bahdanau, D., Cho, K., Bengio, Y.: End-to-end continuous speech recognition using attention-based recurrent NN: first results. arXiv preprint arXiv:1412.1602 (2014)"},{"key":"42_CR6","doi-asserted-by":"crossref","unstructured":"Dhiman, V., Tran, Q.H., Corso, J.J., Chandraker, M.: A continuous occlusion model for road scene understanding. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.469"},{"key":"42_CR7","doi-asserted-by":"crossref","unstructured":"Dosovitskiy, A., et al.: FlowNet: learning optical flow with convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.316"},{"issue":"3","key":"42_CR8","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1109\/TPAMI.2017.2658577","volume":"40","author":"J Engel","year":"2017","unstructured":"Engel, J., Koltun, V., Cremers, D.: Direct sparse odometry. TPAMI 40(3), 611\u2013625 (2017)","journal-title":"TPAMI"},{"key":"42_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1007\/978-3-319-10605-2_54","volume-title":"Computer Vision","author":"J Engel","year":"2014","unstructured":"Engel, J., Sch\u00f6ps, T., Cremers, D.: LSD-SLAM: large-scale direct monocular SLAM. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8690, pp. 834\u2013849. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10605-2_54"},{"key":"42_CR10","doi-asserted-by":"crossref","unstructured":"Forster, C., Pizzoli, M., Scaramuzza, D.: SVO: fast semi-direct monocular visual odometry. In: ICRA (2014)","DOI":"10.1109\/ICRA.2014.6906584"},{"issue":"11","key":"42_CR11","first-page":"1231","volume":"32","author":"A Geiger","year":"2013","unstructured":"Geiger, A., Lenz, P., Stiller, C., Urtasun, R.: Vision meets robotics: the kitti dataset. IJRR 32(11), 1231\u20131237 (2013)","journal-title":"IJRR"},{"key":"42_CR12","doi-asserted-by":"crossref","unstructured":"Geiger, A., Lenz, P., Urtasun, R.: Are we ready for autonomous driving? The kitti vision benchmark suite. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"42_CR13","doi-asserted-by":"crossref","unstructured":"Godard, C., Mac Aodha, O., Firman, M., Brostow, G.: Digging into self-supervised monocular depth estimation. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00393"},{"key":"42_CR14","unstructured":"Graves, A., Jaitly, N.: Towards end-to-end speech recognition with recurrent neural networks. In: ICML (2014)"},{"key":"42_CR15","unstructured":"Grupp, M.: evo: Python package for the evaluation of odometry and SLAM (2017). https:\/\/github.com\/MichaelGrupp\/evo"},{"issue":"2","key":"42_CR16","first-page":"146","volume":"68","author":"RI Hartley","year":"1997","unstructured":"Hartley, R.I., Sturm, P.: Triangulation. CVIU 68(2), 146\u2013157 (1997)","journal-title":"CVIU"},{"key":"42_CR17","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2014)"},{"key":"42_CR18","doi-asserted-by":"crossref","unstructured":"Klein, G., Murray, D.: Parallel tracking and mapping for small AR workspaces. In: ISMAR (2007)","DOI":"10.1109\/ISMAR.2007.4538852"},{"key":"42_CR19","unstructured":"K\u00fcmmerle, R., Grisetti, G., Strasdat, H., Konolige, K., Burgard, W.: g$$^2$$o: a general framework for graph optimization. In: ICRA (2011)"},{"key":"42_CR20","doi-asserted-by":"crossref","unstructured":"Li, R., Wang, S., Long, Z., Gu, D.: UnDeepVO: monocular visual odometry through unsupervised deep learning. In: ICRA (2018)","DOI":"10.1109\/ICRA.2018.8461251"},{"key":"42_CR21","doi-asserted-by":"crossref","unstructured":"Li, Y., Ushiku, Y., Harada, T.: Pose graph optimization for unsupervised monocular visual odometry. In: ICRA (2019)","DOI":"10.1109\/ICRA.2019.8793706"},{"key":"42_CR22","doi-asserted-by":"crossref","unstructured":"Mahjourian, R., Wicke, M., Angelova, A.: Unsupervised learning of depth and ego-motion from monocular video using 3D geometric constraints. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00594"},{"issue":"5","key":"42_CR23","doi-asserted-by":"publisher","first-page":"1147","DOI":"10.1109\/TRO.2015.2463671","volume":"31","author":"R Mur-Artal","year":"2015","unstructured":"Mur-Artal, R., Montiel, J.M.M., Tardos, J.D.: ORB-SLAM: a versatile and accurate monocular SLAM system. IEEE Trans. Robot. 31(5), 1147\u20131163 (2015)","journal-title":"IEEE Trans. Robot."},{"issue":"5","key":"42_CR24","doi-asserted-by":"publisher","first-page":"1255","DOI":"10.1109\/TRO.2017.2705103","volume":"33","author":"R Mur-Artal","year":"2017","unstructured":"Mur-Artal, R., Tard\u00f3s, J.D.: ORB-SLAM2: an open-source SLAM system for monocular, stereo and RGB-D cameras. IEEE Trans. Robot. 33(5), 1255\u20131262 (2017)","journal-title":"IEEE Trans. Robot."},{"key":"42_CR25","unstructured":"Nist\u00e9r, D., Naroditsky, O., Bergen, J.: Visual odometry. In: CVPR (2004)"},{"key":"42_CR26","doi-asserted-by":"crossref","unstructured":"Ranjan, A., et al.: Competitive collaboration: joint unsupervised learning of depth, camera motion, optical flow and motion segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01252"},{"issue":"4","key":"42_CR27","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1109\/MRA.2011.943233","volume":"18","author":"D Scaramuzza","year":"2011","unstructured":"Scaramuzza, D., Fraundorfer, F.: Visual odometry [Tutorial]. IEEE Robot. Autom. Mag. 18(4), 80\u201392 (2011)","journal-title":"IEEE Robot. Autom. Mag."},{"key":"42_CR28","doi-asserted-by":"crossref","unstructured":"Schubert, D., Demmel, N., Usenko, V., Stuckler, J., Cremers, D.: Direct sparse odometry with rolling shutter. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01237-3_42"},{"key":"42_CR29","doi-asserted-by":"crossref","unstructured":"Shen, T., et al.: Beyond photometric loss for self-supervised ego-motion estimation. In: ICRA (2019)","DOI":"10.1109\/ICRA.2019.8793479"},{"key":"42_CR30","doi-asserted-by":"crossref","unstructured":"Sheng, L., Xu, D., Ouyang, W., Wang, X.: Unsupervised collaborative learning of keyframe detection and visual odometry towards monocular deep SLAM. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00440"},{"key":"42_CR31","unstructured":"Shi, X., Chen, Z., Wang, H., Yeung, D.Y., Wong, W.K., Woo, W.C.: Convolutional LSTM network: a machine learning approach for precipitation nowcasting. In: NeurIPS (2015)"},{"key":"42_CR32","doi-asserted-by":"crossref","unstructured":"Song, S., Chandraker, M.: Robust scale estimation in real-time monocular SFM for autonomous driving. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.203"},{"key":"42_CR33","unstructured":"Srivastava, N., Mansimov, E., Salakhudinov, R.: Unsupervised learning of video representations using LSTMs. In: ICML (2015)"},{"key":"42_CR34","doi-asserted-by":"crossref","unstructured":"Sturm, J., Engelhard, N., Endres, F., Burgard, W., Cremers, D.: A benchmark for the evaluation of RGB-D SLAM systems. In: IROS (2012)","DOI":"10.1109\/IROS.2012.6385773"},{"key":"42_CR35","unstructured":"Tang, C., Tan, P.: BA-Net: dense bundle adjustment network. In: ICLR (2019)"},{"key":"42_CR36","unstructured":"Teed, Z., Deng, J.: DeepV2D: video to depth with differentiable structure from motion. In: ICLR (2020)"},{"key":"42_CR37","doi-asserted-by":"crossref","unstructured":"Tiwari, L., Ji, P., Tran, Q.H., Zhuang, B., Anand, S., Chandraker, M.: Pseudo RGB-D for self-improving monocular SLAM and depth prediction. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58621-8_26"},{"key":"42_CR38","doi-asserted-by":"crossref","unstructured":"Triggs, B., McLauchlan, P.F., Hartley, R.I., Fitzgibbon, A.W.: Bundle adjustment\u2013a modern synthesis. In: International Workshop on Vision Algorithms (1999)","DOI":"10.1007\/3-540-44480-7_21"},{"key":"42_CR39","doi-asserted-by":"crossref","unstructured":"Ummenhofer, B., et al.: Demon: depth and motion network for learning monocular stereo. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.596"},{"key":"42_CR40","unstructured":"Villegas, R., Yang, J., Zou, Y., Sohn, S., Lin, X., Lee, H.: Learning to generate long-term future via hierarchical prediction. In: ICML (2017)"},{"key":"42_CR41","doi-asserted-by":"crossref","unstructured":"Wang, C., Miguel Buenaposada, J., Zhu, R., Lucey, S.: Learning depth from monocular videos using direct methods. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00216"},{"key":"42_CR42","doi-asserted-by":"crossref","unstructured":"Wang, R., Pizer, S.M., Frahm, J.M.: Recurrent neural network for (un-) supervised learning of monocular video visual odometry and depth. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00570"},{"key":"42_CR43","doi-asserted-by":"crossref","unstructured":"Wang, R., Schworer, M., Cremers, D.: Stereo DSO: large-scale direct sparse visual odometry with stereo cameras. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.421"},{"key":"42_CR44","doi-asserted-by":"crossref","unstructured":"Wang, S., Clark, R., Wen, H., Trigoni, N.: DeepVO: towards end-to-end visual odometry with deep recurrent convolutional neural networks. In: ICRA (2017)","DOI":"10.1109\/ICRA.2017.7989236"},{"issue":"4\u20135","key":"42_CR45","first-page":"513","volume":"37","author":"S Wang","year":"2018","unstructured":"Wang, S., Clark, R., Wen, H., Trigoni, N.: End-to-end, sequence-to-sequence probabilistic visual odometry through deep neural networks. IJRR 37(4\u20135), 513\u2013542 (2018)","journal-title":"IJRR"},{"key":"42_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1007\/978-3-030-20876-9_19","volume-title":"Computer Vision","author":"F Xue","year":"2019","unstructured":"Xue, F., Wang, Q., Wang, X., Dong, W., Wang, J., Zha, H.: Guided feature selection for deep visual odometry. In: Jawahar, C.V., Li, H., Mori, G., Schindler, K. (eds.) ACCV 2018. LNCS, vol. 11366, pp. 293\u2013308. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-20876-9_19"},{"key":"42_CR47","doi-asserted-by":"crossref","unstructured":"Xue, F., Wang, X., Li, S., Wang, Q., Wang, J., Zha, H.: Beyond tracking: selecting memory and refining poses for deep visual odometry. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00877"},{"issue":"4","key":"42_CR48","first-page":"2878","volume":"3","author":"N Yang","year":"2018","unstructured":"Yang, N., Wang, R., Gao, X., Cremers, D.: Challenges in monocular visual odometry: photometric calibration, motion bias, and rolling shutter effect. RAL 3(4), 2878\u20132885 (2018)","journal-title":"RAL"},{"key":"42_CR49","doi-asserted-by":"crossref","unstructured":"Yin, Z., Shi, J.: GeoNet: unsupervised learning of dense depth, optical flow and camera pose. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00212"},{"key":"42_CR50","doi-asserted-by":"crossref","unstructured":"Zhan, H., Garg, R., Saroj Weerasekera, C., Li, K., Agarwal, H., Reid, I.: Unsupervised learning of monocular depth estimation and visual odometry with deep feature reconstruction. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00043"},{"key":"42_CR51","doi-asserted-by":"crossref","unstructured":"Zhou, H., Ummenhofer, B., Brox, T.: DeepTAM: deep tracking and mapping. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01270-0_50"},{"key":"42_CR52","doi-asserted-by":"crossref","unstructured":"Zhou, T., Brown, M., Snavely, N., Lowe, D.G.: Unsupervised learning of depth and ego-motion from video. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.700"},{"key":"42_CR53","doi-asserted-by":"crossref","unstructured":"Zhuang, B., Tran, Q.H., Ji, P., Cheong, L.F., Chandraker, M.: Learning structure-and-motion-aware rolling shutter correction. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00468"},{"key":"42_CR54","doi-asserted-by":"crossref","unstructured":"Zhuang, B., Tran, Q.H., Lee, G.H., Cheong, L.F., Chandraker, M.: Degeneracy in self-calibration revisited and a deep learning solution for uncalibrated SLAM. In: IROS (2019)","DOI":"10.1109\/IROS40897.2019.8967912"},{"key":"42_CR55","doi-asserted-by":"crossref","unstructured":"Zou, Y., Luo, Z., Huang, J.B.: DF-Net: unsupervised joint learning of depth and flow using cross-task consistency. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01228-1_3"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58568-6_42","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,12]],"date-time":"2024-11-12T00:26:31Z","timestamp":1731371191000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58568-6_42"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585679","9783030585686"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58568-6_42","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"13 November 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic. From the ECCV Workshops 249 full papers, 18 short papers, and 21 further contributions were published out of a total of 467 submissions.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}