{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T12:57:01Z","timestamp":1762088221208,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592264","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"388-397","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Towards Practical Consistent Video Depth Estimation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-4763-4320","authenticated-orcid":false,"given":"Pengzhi","family":"Li","sequence":"first","affiliation":[{"name":"Tsinghua University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6916-9371","authenticated-orcid":false,"given":"Yikang","family":"Ding","sequence":"additional","affiliation":[{"name":"Tsinghua University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3409-1875","authenticated-orcid":false,"given":"Linge","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2302-2986","authenticated-orcid":false,"given":"Jingwei","family":"Guan","sequence":"additional","affiliation":[{"name":"Huawei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1523-1114","authenticated-orcid":false,"given":"Zhiheng","family":"Li","sequence":"additional","affiliation":[{"name":"Tsinghua University, China"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818107"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33783-3_44"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018001"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392457"},{"key":"e_1_3_2_1_5_1","unstructured":"David Eigen Christian Puhrsch and Rob Fergus. 2014. Depth map prediction from a single image using a multi-scale deep network. Advances in neural information processing systems 27."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_45"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.699"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00393"},{"key":"e_1_3_2_1_10_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980.","author":"Kingma P","year":"2014","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings, Part XX 16","author":"Klingner Marvin","year":"2020","unstructured":"Marvin Klingner, Jan-Aike Term\u00f6hlen, Jonas Mikolajczyk, and Tim Fingscheidt. 2020. Self-supervised monocular depth estimation: Solving the dynamic object problem by semantic guidance. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XX 16. Springer, 582\u2013600."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00166"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_11"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2016.32"},{"key":"e_1_3_2_1_15_1","first-page":"1083","article-title":"Blind video temporal consistency via deep video prior","volume":"33","author":"Lei Chenyang","year":"2020","unstructured":"Chenyang Lei, Yazhou Xing, and Qifeng Chen. 2020. Blind video temporal consistency via deep video prior. Advances in Neural Information Processing Systems 33, 1083\u20131093.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00134"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00465"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00218"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01419"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01124"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392377"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.3017478"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01580"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00037"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"e_1_3_2_1_27_1","volume-title":"Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer","author":"Ranftl Ren\u00e9","year":"2020","unstructured":"Ren\u00e9 Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, and Vladlen Koltun. 2020. Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer. IEEE transactions on pattern analysis and machine intelligence 44, 3 (2020), 1623\u20131637."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01252"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126544"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.445"},{"key":"e_1_3_2_1_31_1","unstructured":"Xingjian Shi Zhourong Chen Hao Wang Dit-Yan Yeung Wai-Kin Wong and Wang-chun Woo. 2015. Convolutional LSTM network: A machine learning approach for precipitation nowcasting. Advances in neural information processing systems 28."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Nathan Silberman Derek Hoiem Pushmeet Kohli and Rob Fergus. 2012. Indoor segmentation and support inference from rgbd images.ECCV (5) 7576 746\u2013760.","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2019.105524"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6385773"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV) Workshops. 0\u20130.","author":"Tananaev Denis","year":"2018","unstructured":"Denis Tananaev, Huizhong Zhou, Benjamin Ummenhofer, and Thomas Brox. 2018. Temporally consistent depth estimation in videos with recurrent architectures. In Proceedings of the European Conference on Computer Vision (ECCV) Workshops. 0\u20130."},{"key":"e_1_3_2_1_36_1","volume-title":"BA-Net: Dense Bundle Adjustment Networks. In International Conference on Learning Representations.","author":"Tang Chengzhou","year":"2019","unstructured":"Chengzhou Tang and Ping Tan. 2019. BA-Net: Dense Bundle Adjustment Networks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Learning Representations.","author":"Teed Zachary","year":"2020","unstructured":"Zachary Teed and Jia Deng. 2020. DeepV2D: Video to Depth with Differentiable Structure from Motion. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00283"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00216"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2019.00046"},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Wang Qiang","year":"2019","unstructured":"Qiang Wang, Shizhen Zheng, Qingsong Yan, Fei Deng, Kaiyong Zhao, and Xiaowen Chu. 2019. IRS: A Large Naturalistic Indoor Robotics Stereo Dataset to Train Deep Models for Disparity and Surface Normal Estimation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547978"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00122"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00379"},{"key":"e_1_3_2_1_45_1","unstructured":"Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00069"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00795"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3097396"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00027"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5336\u20135345","author":"Yoon Jae\u00a0Shin","year":"2020","unstructured":"Jae\u00a0Shin Yoon, Kihwan Kim, Orazio Gallo, Hyun\u00a0Soo Park, and Jan Kautz. 2020. Novel view synthesis of dynamic scenes with globally coherent depths from a monocular camera. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5336\u20135345."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00198"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00493"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00181"},{"volume-title":"Synthetic Defocus and Look-Ahead Autofocus for Casual Videography","author":"Zhang Xuaner","key":"e_1_3_2_1_54_1","unstructured":"Xuaner Zhang, Kevin Matzen, Vivien Nguyen, Dillon Yao, You Zhang, and Ren Ng. 2019. Synthetic Defocus and Look-Ahead Autofocus for Casual Videography. Association for Computing Machinery."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480500"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_50"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.700"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_3"}],"event":{"name":"ICMR '23: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Thessaloniki Greece","acronym":"ICMR '23"},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592264","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592264","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:30Z","timestamp":1750178250000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592264"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":58,"alternative-id":["10.1145\/3591106.3592264","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592264","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}