{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T16:49:08Z","timestamp":1777567748230,"version":"3.51.4"},"publisher-location":"Cham","reference-count":86,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031727535","type":"print"},{"value":"9783031727542","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72754-2_9","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"146-165","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Learning Modality-Agnostic Representation for\u00a0Semantic Segmentation from\u00a0Any Modalities"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4008-8951","authenticated-orcid":false,"given":"Xu","family":"Zheng","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1450-811X","authenticated-orcid":false,"given":"Yuanhuiyi","family":"Lyu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7485-4493","authenticated-orcid":false,"given":"Lin","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"9_CR1","doi-asserted-by":"crossref","unstructured":"Alonso, I., Murillo, A.C.: Ev-segnet: semantic segmentation for event-based cameras. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (2019)","DOI":"10.1109\/CVPRW.2019.00205"},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Borse, S., et al.: X-align: cross-modal cross-view alignment for bird\u2019s-eye-view segmentation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 3287\u20133297 (2023)","DOI":"10.1007\/s00138-023-01400-7"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Broedermann, T., Sakaridis, C., Dai, D., Van\u00a0Gool, L.: Hrfuser: A multi-resolution sensor fusion architecture for 2d object detection. arXiv preprint arXiv:2206.15157 (2022)","DOI":"10.1109\/ITSC57777.2023.10422432"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Cao, J., Zheng, X., Lyu, Y., Wang, J., Xu, R., Wang, L.: Chasing day and night: towards robust and efficient all-day object detection guided by an event camera. arXiv preprint arXiv:2309.09297 (2023)","DOI":"10.1109\/ICRA57147.2024.10611705"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Cao, J., Leng, H., Lischinski, D., Cohen-Or, D., Tu, C., Li, Y.: Shapeconv: shape-aware convolutional layer for indoor rgb-d semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7088\u20137097 (2021)","DOI":"10.1109\/ICCV48922.2021.00700"},{"issue":"4","key":"9_CR6","doi-asserted-by":"publisher","first-page":"1787","DOI":"10.1109\/TCSVT.2022.3215979","volume":"33","author":"G Chen","year":"2022","unstructured":"Chen, G., et al.: Modality-induced transfer-fusion network for rgb-d and rgb-t salient object detection. IEEE Trans. Circuits Syst. Video Technol. 33(4), 1787\u20131801 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"9_CR7","unstructured":"Chen, J., Deguchi, D., Zhang, C., Zheng, X., Murase, H.: Clip is also a good teacher: a new learning framework for inductive zero-shot semantic segmentation. arXiv preprint arXiv:2310.02296 (2023)"},{"key":"9_CR8","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110431","volume":"152","author":"J Chen","year":"2024","unstructured":"Chen, J., Deguchi, D., Zhang, C., Zheng, X., Murase, H.: Frozen is better than learning: a new design of prototype-based classifier for semantic segmentation. Pattern Recogn. 152, 110431 (2024)","journal-title":"Pattern Recogn."},{"key":"9_CR9","doi-asserted-by":"publisher","first-page":"2313","DOI":"10.1109\/TIP.2021.3049332","volume":"30","author":"LZ Chen","year":"2021","unstructured":"Chen, L.Z., Lin, Z., Wang, Z., Yang, Y.L., Cheng, M.M.: Spatial information guided convolution for real-time rgbd semantic segmentation. IEEE Trans. Image Process. 30, 2313\u20132324 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"9_CR10","unstructured":"Chen, M., Yao, J., Xing, L., Wang, Y., Zhang, Y., Wang, Y.: Redundancy-adaptive multimodal learning for imperfect data. arXiv preprint arXiv:2310.14496 (2023)"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Cheng, Y., Wei, F., Bao, J., Chen, D., Zhang, W.: Cico: domain-aware sign language retrieval via cross-lingual contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19016\u201319026 (2023)","DOI":"10.1109\/CVPR52729.2023.01823"},{"key":"9_CR12","doi-asserted-by":"publisher","first-page":"6800","DOI":"10.1109\/TIP.2022.3216198","volume":"31","author":"R Cong","year":"2022","unstructured":"Cong, R., et al.: Cir-net: cross-modality interaction and refinement for rgb-d salient object detection. IEEE Trans. Image Process. 31, 6800\u20136815 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"9_CR13","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: Imagebind: one embedding space to bind them all. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15180\u201315190 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"9_CR14","unstructured":"Guo, Z., et\u00a0al.: Point-bind & point-llm: aligning point cloud with multi-modality for 3d understanding, generation, and instruction following. arXiv preprint arXiv:2309.00615 (2023)"},{"key":"9_CR15","unstructured":"Han, J., et\u00a0al.: Imagebind-llm: multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)"},{"key":"9_CR16","unstructured":"Huang, K., Shi, B., Li, X., Li, X., Huang, S., Li, Y.: Multi-modal sensor fusion for auto driving perception: a survey. arXiv preprint arXiv:2202.02703 (2022)"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Huang, T., et al.: Clip2point: transfer clip to point cloud classification with image-depth pre-training. arXiv preprint arXiv:2210.01055 (2022)","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Hui, T., et al.: Bridging search region interaction with template for rgb-t tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13630\u201313639 (2023)","DOI":"10.1109\/CVPR52729.2023.01310"},{"key":"9_CR19","doi-asserted-by":"publisher","first-page":"2321","DOI":"10.1109\/TIP.2022.3154931","volume":"31","author":"W Ji","year":"2022","unstructured":"Ji, W., et al.: Dmra: depth-induced multi-scale recurrent attention network for rgb-d saliency detection. IEEE Trans. Image Process. 31, 2321\u20132336 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"9_CR20","doi-asserted-by":"publisher","first-page":"1829","DOI":"10.1109\/TIP.2023.3249579","volume":"32","author":"Z Jia","year":"2023","unstructured":"Jia, Z., et al.: Event-based semantic segmentation with posterior attention. IEEE Trans. Image Process. 32, 1829\u20131842 (2023)","journal-title":"IEEE Trans. Image Process."},{"key":"9_CR21","doi-asserted-by":"publisher","unstructured":"Lee, M., Park, C., Cho, S., Lee, S.: Spsn: superpixel prototype sampling network for rgb-d salient object detection. In: European Conference on Computer Vision, pp. 630\u2013647. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19818-2_36","DOI":"10.1007\/978-3-031-19818-2_36"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Li, J., Dai, H., Han, H., Ding, Y.: Mseg3d: multi-modal 3d semantic segmentation for autonomous driving. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21694\u201321704 (2023)","DOI":"10.1109\/CVPR52729.2023.02078"},{"key":"9_CR23","doi-asserted-by":"crossref","unstructured":"Li, Y., et\u00a0al.: Deepfusion: lidar-camera deep fusion for multi-modal 3d object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17182\u201317191 (2022)","DOI":"10.1109\/CVPR52688.2022.01667"},{"key":"9_CR24","doi-asserted-by":"crossref","unstructured":"Liang, Y., Wakaki, R., Nobuhara, S., Nishino, K.: Multimodal material segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19800\u201319808 (2022)","DOI":"10.1109\/CVPR52688.2022.01918"},{"issue":"11","key":"9_CR25","doi-asserted-by":"publisher","first-page":"7646","DOI":"10.1109\/TCSVT.2022.3184840","volume":"32","author":"G Liao","year":"2022","unstructured":"Liao, G., Gao, W., Li, G., Wang, J., Kwong, S.: Cross-collaborative fusion-encoder network for robust rgb-thermal salient object detection. IEEE Trans. Circuits Syst. Video Technol. 32(11), 7646\u20137661 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Liu, H., Lu, T., Xu, Y., Liu, J., Li, W., Chen, L.: Camliflow: bidirectional camera-lidar fusion for joint optical flow and scene flow estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5791\u20135801 (2022)","DOI":"10.1109\/CVPR52688.2022.00570"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Liu, R., et al.: Fourier prompt tuning for modality-incomplete scene segmentation. arXiv preprint arXiv:2401.16923 (2024)","DOI":"10.1109\/IV55156.2024.10588722"},{"key":"9_CR28","unstructured":"Lyu, Y., Zheng, X., Kim, D., Wang, L.: Omnibind: teach to build unequal-scale modality interaction for omni-bind of all. arXiv preprint arXiv:2405.16108 (2024)"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Lyu, Y., Zheng, X., Zhou, J., Wang, L.: Unibind: llm-augmented unified and balanced representation space to bind them all. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26752\u201326762 (2024)","DOI":"10.1109\/CVPR52733.2024.02526"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Maheshwari, H., Liu, Y.C., Kira, Z.: Missing modality robustness in semi-supervised multi-modal semantic segmentation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1020\u20131030 (2024)","DOI":"10.1109\/WACV57701.2024.00106"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Man, Y., Gui, L.Y., Wang, Y.X.: Bev-guided multi-modality fusion for driving perception. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21960\u201321969 (2023)","DOI":"10.1109\/CVPR52729.2023.02103"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Milioto, A., Vizzo, I., Behley, J., Stachniss, C.: Rangenet++: fast and accurate lidar semantic segmentation. In: 2019 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 4213\u20134220. IEEE (2019)","DOI":"10.1109\/IROS40897.2019.8967762"},{"key":"9_CR33","doi-asserted-by":"publisher","first-page":"892","DOI":"10.1109\/TIP.2023.3234702","volume":"32","author":"Y Pang","year":"2023","unstructured":"Pang, Y., Zhao, X., Zhang, L., Lu, H.: Caver: cross-modal view-mixed transformer for bi-modal salient object detection. IEEE Trans. Image Process. 32, 892\u2013904 (2023)","journal-title":"IEEE Trans. Image Process."},{"key":"9_CR34","unstructured":"Park, S.J., Hong, K.S., Lee, S.: Rdfnet: Rgb-d multi-level residual feature fusion for indoor semantic segmentation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4980\u20134989 (2017)"},{"key":"9_CR35","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"9_CR36","doi-asserted-by":"crossref","unstructured":"Reza, M.K., Prater-Bennette, A., Asif, M.S.: Robust multimodal learning with missing modalities via parameter-efficient adaptation. arXiv preprint arXiv:2310.03986 (2023)","DOI":"10.1109\/TPAMI.2024.3476487"},{"key":"9_CR37","doi-asserted-by":"crossref","unstructured":"Shivakumar, S.S., Rodrigues, N., Zhou, A., Miller, I.D., Kumar, V., Taylor, C.J.: Pst900: Rgb-thermal calibration, dataset and segmentation network. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 9441\u20139447. IEEE (2020)","DOI":"10.1109\/ICRA40945.2020.9196831"},{"key":"9_CR38","doi-asserted-by":"publisher","first-page":"6124","DOI":"10.1109\/TIP.2022.3205747","volume":"31","author":"M Song","year":"2022","unstructured":"Song, M., Song, W., Yang, G., Chen, C.: Improving rgb-d salient object detection via modality-aware decoder. IEEE Trans. Image Process. 31, 6124\u20136138 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"9_CR39","unstructured":"Su, Y., Lan, T., Li, H., Xu, J., Wang, Y., Cai, D.: Pandagpt: one model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)"},{"key":"9_CR40","doi-asserted-by":"crossref","unstructured":"Sun, W., et al.: Learning audio-visual source localization via false negative aware contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6420\u20136429 (2023)","DOI":"10.1109\/CVPR52729.2023.00621"},{"issue":"3","key":"9_CR41","doi-asserted-by":"publisher","first-page":"2576","DOI":"10.1109\/LRA.2019.2904733","volume":"4","author":"Y Sun","year":"2019","unstructured":"Sun, Y., Zuo, W., Liu, M.: Rtfnet: Rgb-thermal fusion network for semantic segmentation of urban scenes. IEEE Rob. Autom. Lett. 4(3), 2576\u20132583 (2019)","journal-title":"IEEE Rob. Autom. Lett."},{"key":"9_CR42","doi-asserted-by":"publisher","first-page":"1285","DOI":"10.1109\/TIP.2022.3140606","volume":"31","author":"F Wang","year":"2022","unstructured":"Wang, F., Pan, J., Xu, S., Tang, J.: Learning discriminative cross-modality features for rgb-d saliency detection. IEEE Trans. Image Process. 31, 1285\u20131297 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"9_CR43","doi-asserted-by":"publisher","unstructured":"Wang, H., Ma, C., Zhang, J., Zhang, Y., Avery, J., Hull, L., Carneiro, G.: Learnable cross-modal knowledge distillation for multi-modal learning with missing modality. In: International Conference on Medical Image Computing and Computer-Assisted Intervention, pp. 216\u2013226. Springer, Heidelberg (2023). https:\/\/doi.org\/10.1007\/978-3-031-43901-8_21","DOI":"10.1007\/978-3-031-43901-8_21"},{"key":"9_CR44","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"664","DOI":"10.1007\/978-3-319-46454-1_40","volume-title":"Computer Vision \u2013 ECCV 2016","author":"J Wang","year":"2016","unstructured":"Wang, J., Wang, Z., Tao, D., See, S., Wang, G.: Learning common and specific features for RGB-D semantic segmentation with deconvolutional networks. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 664\u2013679. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_40"},{"key":"9_CR45","doi-asserted-by":"crossref","unstructured":"Wang, S., Caesar, H., Nan, L., Kooij, J.F.: Unibev: multi-modal 3d object detection with uniform bev encoders for robustness against missing sensor modalities. arXiv preprint arXiv:2309.14516 (2023)","DOI":"10.1109\/IV55156.2024.10588783"},{"key":"9_CR46","doi-asserted-by":"crossref","unstructured":"Wang, Y., Chen, X., Cao, L., Huang, W., Sun, F., Wang, Y.: Multimodal token fusion for vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12186\u201312195 (2022)","DOI":"10.1109\/CVPR52688.2022.01187"},{"key":"9_CR47","first-page":"4835","volume":"33","author":"Y Wang","year":"2020","unstructured":"Wang, Y., Huang, W., Sun, F., Xu, T., Rong, Y., Huang, J.: Deep multimodal fusion by channel exchanging. Adv. Neural. Inf. Process. Syst. 33, 4835\u20134845 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR48","doi-asserted-by":"crossref","unstructured":"Wang, Y., Sun, F., Lu, M., Yao, A.: Learning deep multimodal feature representation with asymmetric multi-layer fusion. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 3902\u20133910 (2020)","DOI":"10.1145\/3394171.3413621"},{"key":"9_CR49","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Multi-modal 3d object detection in autonomous driving: a survey. Int. J. Comput. Vision 1\u201331 (2023)","DOI":"10.2139\/ssrn.4398254"},{"key":"9_CR50","doi-asserted-by":"crossref","unstructured":"Wei, S., Luo, C., Luo, Y.: Mmanet: margin-aware distillation and modality-aware regularization for incomplete multimodal learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20039\u201320049 (2023)","DOI":"10.1109\/CVPR52729.2023.01919"},{"key":"9_CR51","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108881","volume":"131","author":"W Wu","year":"2022","unstructured":"Wu, W., Chu, T., Liu, Q.: Complementarity-aware cross-modal feature fusion network for rgb-t semantic segmentation. Pattern Recogn. 131, 108881 (2022)","journal-title":"Pattern Recogn."},{"key":"9_CR52","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: simple and efficient design for semantic segmentation with transformers. Adv. Neural. Inf. Process. Syst. 34, 12077\u201312090 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR53","doi-asserted-by":"publisher","first-page":"4149","DOI":"10.1109\/TCSVT.2023.3241196","volume":"33","author":"Z Xie","year":"2023","unstructured":"Xie, Z., et al.: Cross-modality double bidirectional interaction and fusion network for rgb-t salient object detection. IEEE Trans. Circuits Syst. Video Technol. 33, 4149\u20134163 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"9_CR54","doi-asserted-by":"publisher","unstructured":"Yan, X., et al.: 2dpass: 2d priors assisted semantic segmentation on lidar point clouds. In: European Conference on Computer Vision, pp. 677\u2013695. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_39","DOI":"10.1007\/978-3-031-19815-1_39"},{"key":"9_CR55","doi-asserted-by":"publisher","unstructured":"Ying, X., Chuah, M.C.: Uctnet: uncertainty-aware cross-modal transformer network for indoor rgb-d semantic segmentation. In: European Conference on Computer Vision, pp. 20\u201337. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-20056-4_2","DOI":"10.1007\/978-3-031-20056-4_2"},{"key":"9_CR56","first-page":"7281","volume":"34","author":"Y Yuan","year":"2021","unstructured":"Yuan, Y., et al.: Hrformer: high-resolution vision transformer for dense predict. Adv. Neural. Inf. Process. Syst. 34, 7281\u20137293 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"9_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, B., Wang, Z., Ling, Y., Guan, Y., Zhang, S., Li, W.: Mx2m: masked cross-modality modeling in domain adaptation for 3d semantic segmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 3401\u20133409 (2023)","DOI":"10.1609\/aaai.v37i3.25448"},{"key":"9_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, J., Liu, H., Yang, K., Hu, X., Liu, R., Stiefelhagen, R.: Cmx: cross-modal fusion for rgb-x semantic segmentation with transformers. arXiv preprint arXiv:2203.04838 (2022)","DOI":"10.1109\/TITS.2023.3300537"},{"key":"9_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, J., Liu, H., Yang, K., Hu, X., Liu, R., Stiefelhagen, R.: Cmx: cross-modal fusion for rgb-x semantic segmentation with transformers. IEEE Trans. Intell. Transport. Syst. (2023)","DOI":"10.1109\/TITS.2023.3300537"},{"key":"9_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: Delivering arbitrary-modal semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1136\u20131147 (2023)","DOI":"10.1109\/CVPR52729.2023.00116"},{"key":"9_CR61","doi-asserted-by":"crossref","unstructured":"Zhang, J., Yang, K., Stiefelhagen, R.: Issafe: improving semantic segmentation in accidents by fusing event-based data. In: 2021 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 1132\u20131139. IEEE (2021)","DOI":"10.1109\/IROS51168.2021.9636109"},{"key":"9_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Zhao, S., Luo, Y., Zhang, D., Huang, N., Han, J.: Abmdrnet: adaptive-weighted bi-directional modality difference reduction network for rgb-t semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2633\u20132642 (2021)","DOI":"10.1109\/CVPR46437.2021.00266"},{"key":"9_CR63","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: Pointclip: point cloud understanding by clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8552\u20138562 (2022)","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"9_CR64","unstructured":"Zhang, R., et al.: Llama-adapter: efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"9_CR65","doi-asserted-by":"crossref","unstructured":"Zhang, T., Guo, H., Jiao, Q., Zhang, Q., Han, J.: Efficient rgb-t tracking via cross-modality distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5404\u20135413 (2023)","DOI":"10.1109\/CVPR52729.2023.00523"},{"key":"9_CR66","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Polarnet: an improved grid representation for online lidar point clouds semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9601\u20139610 (2020)","DOI":"10.1109\/CVPR42600.2020.00962"},{"key":"9_CR67","unstructured":"Zhang, Y., et al.: Meta-transformer: a unified framework for multimodal learning. arXiv preprint arXiv:2307.10802 (2023)"},{"key":"9_CR68","doi-asserted-by":"crossref","unstructured":"Zhao, Z., Palani, H., Liu, T., Evans, L., Toner, R.: Multi-modality guidance network for missing modality inference. arXiv preprint arXiv:2309.03452 (2023)","DOI":"10.1109\/ICMEW63481.2024.10645412"},{"key":"9_CR69","doi-asserted-by":"crossref","unstructured":"Zheng, J., et al.: Cvt-slr: contrastive visual-textual transformation for sign language recognition with variational alignment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23141\u201323150 (2023)","DOI":"10.1109\/CVPR52729.2023.02216"},{"key":"9_CR70","unstructured":"Zheng, X., et al.: Deep learning for event-based vision: a comprehensive survey and benchmarks. arXiv preprint arXiv:2302.08890 (2023)"},{"key":"9_CR71","unstructured":"Zheng, X., Luo, Y., Wang, H., Fu, C., Wang, L.: Transformer-cnn cohort: semi-supervised semantic segmentation by the best of both students. arXiv preprint arXiv:2209.02178 (2022)"},{"key":"9_CR72","doi-asserted-by":"crossref","unstructured":"Zheng, X., Luo, Y., Zhou, P., Wang, L.: Distilling efficient vision transformers from cnns for semantic segmentation. arXiv preprint arXiv:2310.07265 (2023)","DOI":"10.2139\/ssrn.4782766"},{"key":"9_CR73","doi-asserted-by":"crossref","unstructured":"Zheng, X., Pan, T., Luo, Y., Wang, L.: Look at the neighbor: distortion-aware unsupervised domain adaptation for panoramic semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 18687\u201318698 (2023)","DOI":"10.1109\/ICCV51070.2023.01713"},{"key":"9_CR74","doi-asserted-by":"crossref","unstructured":"Zheng, X., Wang, L.: Eventdance: unsupervised source-free cross-modal adaptation for event-based object recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17448\u201317458 (2024)","DOI":"10.1109\/CVPR52733.2024.01652"},{"key":"9_CR75","doi-asserted-by":"crossref","unstructured":"Zheng, X., Zhou, P., Vasilakos, A.V., Wang, L.: 360sfuda++: towards source-free uda for panoramic segmentation by learning reliable category prototypes. arXiv preprint arXiv:2404.16501 (2024)","DOI":"10.1109\/CVPR52733.2024.02634"},{"key":"9_CR76","doi-asserted-by":"crossref","unstructured":"Zheng, X., Zhou, P., Vasilakos, A.V., Wang, L.: Semantics distortion and style matter: towards source-free uda for panoramic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 27885\u201327895 (2024)","DOI":"10.1109\/CVPR52733.2024.02634"},{"key":"9_CR77","doi-asserted-by":"crossref","unstructured":"Zheng, X., Zhu, J., Liu, Y., Cao, Z., Fu, C., Wang, L.: Both style and distortion matter: Dual-path unsupervised domain adaptation for panoramic semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1285\u20131295 (2023)","DOI":"10.1109\/CVPR52729.2023.00130"},{"key":"9_CR78","doi-asserted-by":"crossref","unstructured":"Zhou, H., Qi, L., Wan, Z., Huang, H., Yang, X.: Rgb-d co-attention network for semantic segmentation. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69525-5_31"},{"key":"9_CR79","unstructured":"Zhou, J., Zheng, X., Lyu, Y., Wang, L.: E-clip: towards label-efficient event-based open-world understanding by clip. arXiv preprint arXiv:2308.03135 (2023)"},{"key":"9_CR80","doi-asserted-by":"crossref","unstructured":"Zhou, J., Zheng, X., Lyu, Y., Wang, L.: Exact: language-guided conceptual reasoning and uncertainty estimation for event-based action recognition and more. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18633\u201318643 (2024)","DOI":"10.1109\/CVPR52733.2024.01763"},{"key":"9_CR81","doi-asserted-by":"crossref","unstructured":"Zhou, W., Zhang, H., Yan, W., Lin, W.: Mmsmcnet: modal memory sharing and morphological complementary networks for rgb-t urban scene semantic segmentation. IEEE Trans. Circuits Syst. Video Technol. (2023)","DOI":"10.1109\/TCSVT.2023.3275314"},{"key":"9_CR82","unstructured":"Zhu, B., et\u00a0al.: Languagebind: extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)"},{"key":"9_CR83","doi-asserted-by":"crossref","unstructured":"Zhu, J., Lai, S., Chen, X., Wang, D., Lu, H.: Visual prompt multi-modal tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9516\u20139526 (2023)","DOI":"10.1109\/CVPR52729.2023.00918"},{"key":"9_CR84","doi-asserted-by":"crossref","unstructured":"Zhu, J., Luo, Y., Zheng, X., Wang, H., Wang, L.: A good student is cooperative and reliable: Cnn-transformer collaborative learning for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11720\u201311730 (2023)","DOI":"10.1109\/ICCV51070.2023.01076"},{"key":"9_CR85","doi-asserted-by":"crossref","unstructured":"Zhu, X., Zhang, R., He, B., Zeng, Z., Zhang, S., Gao, P.: Pointclip v2: adapting clip for powerful 3d open-world learning. arXiv preprint arXiv:2211.11682 (2022)","DOI":"10.1109\/ICCV51070.2023.00249"},{"key":"9_CR86","doi-asserted-by":"crossref","unstructured":"Zhuang, Z., Li, R., Jia, K., Wang, Q., Li, Y., Tan, M.: Perception-aware multi-sensor fusion for 3d lidar semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16280\u201316290 (2021)","DOI":"10.1109\/ICCV48922.2021.01597"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72754-2_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:02:28Z","timestamp":1730300548000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72754-2_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031727535","9783031727542"],"references-count":86,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72754-2_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}