{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T11:21:37Z","timestamp":1768994497872,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":30,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556922","type":"print"},{"value":"9789819556939","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5693-9_27","type":"book-chapter","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T21:23:18Z","timestamp":1768944198000},"page":"388-402","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["HM-Net: Hierarchical Attention-Weighted Multimodal Feature Fusion Network for\u00a0Echo-Visual Depth Estimation"],"prefix":"10.1007","author":[{"given":"Wenjie","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Long","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Baolong","family":"Li","sequence":"additional","affiliation":[]},{"given":"Mingliang","family":"Xu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,21]]},"reference":[{"key":"27_CR1","doi-asserted-by":"crossref","unstructured":"Agarwal, A., Arora, C.: Attention attention everywhere: monocular depth prediction with skip attention. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5861\u20135870 (2023)","DOI":"10.1109\/WACV56688.2023.00581"},{"key":"27_CR2","unstructured":"Bhat, S.F., Alhashim, I., Wonka, P.: Adabins: depth estimation using adaptive bins. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4009\u20134018 (2021)"},{"key":"27_CR3","doi-asserted-by":"crossref","unstructured":"Brunetto, A., Hornauer, S., Stella, X.Y., Moutarde, F.: The audio-visual batvision dataset for research on sight and sound. In: IEEE\/RSJ International Conference on Intelligent Robots and Systems, pp.\u00a01\u20138 (2023)","DOI":"10.1109\/IROS55552.2023.10341715"},{"key":"27_CR4","doi-asserted-by":"crossref","unstructured":"Chang, A., et al.: Matterport3d: learning from RGB-D data in indoor environments. In: International Conference on 3D Vision, pp. 667\u2013676 (2017)","DOI":"10.1109\/3DV.2017.00081"},{"key":"27_CR5","doi-asserted-by":"crossref","unstructured":"Chen, C., et al.: Soundspaces: audio-visual navigaton in 3D environments. In: European Conference on Computer Vision, pp. 17\u201336 (2020)","DOI":"10.1007\/978-3-030-58539-6_2"},{"key":"27_CR6","doi-asserted-by":"crossref","unstructured":"Christensen, J.H., Hornauer, S., Stella, X.Y.: Batvision: learning to see 3D spatial layout with two ears. In: IEEE International Conference on Robotics and Automation, pp. 1581\u20131587 (2020)","DOI":"10.1109\/ICRA40945.2020.9196934"},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Devulapally, A., Khan, M.F.F., Advani, S., Narayanan, V.: Multi-modal fusion of event and RGB for monocular depth estimation using a unified transformer-based architecture. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2081\u20132089 (2024)","DOI":"10.1109\/CVPRW63382.2024.00213"},{"key":"27_CR8","unstructured":"Eigen, D., Puhrsch, C., Fergus, R.: Depth map prediction from a single image using a multi-scale deep network. In: Advances in Neural Information Processing Systems, pp.\u00a01\u20139 (2014)"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Fu, H., Gong, M., Wang, C., Batmanghelich, K., Tao, D.: Deep ordinal regression network for monocular depth estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2002\u20132011 (2018)","DOI":"10.1109\/CVPR.2018.00214"},{"key":"27_CR10","doi-asserted-by":"crossref","unstructured":"Gao, R., Chen, C., Al-Halah, Z., Schissler, C., Grauman, K.: Visualechoes: spatial image representation learning through echolocation. In: European Conference on Computer Vision, pp. 658\u2013676 (2020)","DOI":"10.1007\/978-3-030-58545-7_38"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Irie, G., Shibata, T., Kimura, A.: Co-attention-guided bilinear model for echo-based depth estimation. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 4648\u20134652 (2022)","DOI":"10.1109\/ICASSP43922.2022.9746476"},{"key":"27_CR12","doi-asserted-by":"crossref","unstructured":"Khan, M.F.F., Devulapally, A., Advani, S., Narayanan, V.: Robust multimodal depth estimation using transformer based generative adversarial networks. In: Proceedings of the ACM International Conference on Multimedia, pp. 3559\u20133568 (2022)","DOI":"10.1145\/3503161.3548418"},{"key":"27_CR13","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: International Conference on Learning Representation, pp. 1\u201311 (2015)"},{"issue":"6","key":"27_CR14","doi-asserted-by":"publisher","first-page":"7499","DOI":"10.1109\/TMC.2023.3334271","volume":"23","author":"D Li","year":"2024","unstructured":"Li, D., Xu, J., Yang, Z., Ma, Q., Zhang, L., Chen, P.: Leovr: motion-inspired visual-lidar fusion for environment depth estimation. IEEE Trans. Mob. Comput. 23(6), 7499\u20137516 (2024)","journal-title":"IEEE Trans. Mob. Comput."},{"key":"27_CR15","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"27_CR16","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1016\/j.dsp.2017.11.003","volume":"77","author":"C Mateo","year":"2018","unstructured":"Mateo, C., Talavera, J.A.: Short-time fourier transform with the window size fixed in the frequency domain. Digit. Signal Process. 77, 13\u201321 (2018)","journal-title":"Digit. Signal Process."},{"key":"27_CR17","doi-asserted-by":"crossref","unstructured":"Parida, K.K., Srivastava, S., Sharma, G.: Beyond image to depth: improving depth prediction using echoes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8268\u20138277 (2021)","DOI":"10.1109\/CVPR46437.2021.00817"},{"issue":"8","key":"27_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3663570","volume":"20","author":"B Peng","year":"2024","unstructured":"Peng, B., et al.: Self-supervised monocular depth estimation via binocular geometric correlation learning. ACM Trans. Multimedia Comput. Commun. Appl. 20(8), 1\u201319 (2024)","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl."},{"key":"27_CR19","doi-asserted-by":"crossref","unstructured":"Savva, M., et al.: Habitat: a platform for embodied AI research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9339\u20139347 (2019)","DOI":"10.1109\/ICCV.2019.00943"},{"key":"27_CR20","doi-asserted-by":"crossref","unstructured":"Singh, A.D., et al.: Depth estimation from camera image and mmwave radar point cloud. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9275\u20139285 (2023)","DOI":"10.1109\/CVPR52729.2023.00895"},{"key":"27_CR21","unstructured":"Straub, J., et al.: The replica dataset: a digital replica of indoor spaces. arXiv preprint arXiv:1906.05797 (2019)"},{"key":"27_CR22","doi-asserted-by":"crossref","unstructured":"Su, W., Tao, W.: Efficient edge-preserving multi-view stereo network for depth estimation. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 2348\u20132356 (2023)","DOI":"10.1609\/aaai.v37i2.25330"},{"key":"27_CR23","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2024.104681","volume":"154","author":"C Xia","year":"2024","unstructured":"Xia, C., et al.: RCFNet: related cross-level feature network with cascaded self-distillation for monocular depth estimation. Digit. Signal Process. 154, 104681 (2024)","journal-title":"Digit. Signal Process."},{"key":"27_CR24","doi-asserted-by":"crossref","unstructured":"Xu, J., Li, Z., Du, B., Zhang, M., Liu, J.: Reluplex made more practical: leaky relu. In: IEEE Symposium on Computers and Communications, pp.\u00a01\u20137 (2020)","DOI":"10.1109\/ISCC50000.2020.9219587"},{"key":"27_CR25","doi-asserted-by":"crossref","unstructured":"Yang, L., Kang, B., Huang, Z., Xu, X., Feng, J., Zhao, H.: Depth anything: unleashing the power of large-scale unlabeled data. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10371\u201310381 (2024)","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"27_CR26","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Stereo depth estimation with echoes. In: European Conference on Computer Vision, pp. 496\u2013513 (2022)","DOI":"10.1007\/978-3-031-19812-0_29"},{"issue":"9","key":"27_CR27","first-page":"1","volume":"20","author":"J Zhang","year":"2024","unstructured":"Zhang, J., et al.: Learning domain invariant features for unsupervised indoor depth estimation adaptation. ACM Trans. Multimedia Comput. Commun. Appl. 20(9), 1\u201323 (2024)","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl."},{"key":"27_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109885","volume":"144","author":"S Zhang","year":"2023","unstructured":"Zhang, S., Xu, W., Wei, Z., Zhang, L., Wang, Y., Liu, J.: Arai-mvsnet: a multi-view stereo depth estimation network with adaptive depth range and depth interval. Pattern Recogn. 144, 109885 (2023)","journal-title":"Pattern Recogn."},{"key":"27_CR29","doi-asserted-by":"publisher","first-page":"1016","DOI":"10.1109\/TIP.2024.3355807","volume":"33","author":"L Zhao","year":"2024","unstructured":"Zhao, L., Wei, Y., Li, J., Zhou, J., Lu, J.: Structure-aware cross-modal transformer for depth completion. IEEE Trans. Image Process. 33, 1016\u20131031 (2024)","journal-title":"IEEE Trans. Image Process."},{"key":"27_CR30","unstructured":"Zhu, L., Rahtu, E., Zhao, H.: Beyond visual field of view: perceiving 3D environment with echoes and vision. arXiv preprint arXiv:2207.01136 (2022)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5693-9_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T21:23:23Z","timestamp":1768944203000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5693-9_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556922","9789819556939"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5693-9_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"21 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}