{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T17:19:07Z","timestamp":1779383947132,"version":"3.53.1"},"reference-count":51,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,10,1]],"date-time":"2026-10-01T00:00:00Z","timestamp":1790812800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T00:00:00Z","timestamp":1772668800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,10]]},"DOI":"10.1016\/j.patcog.2026.113429","type":"journal-article","created":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T07:22:37Z","timestamp":1772781757000},"page":"113429","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["P3D: Plug-and-play prompt-driven framework for RGB-thermal semantic segmentation"],"prefix":"10.1016","volume":"178","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-5104-8184","authenticated-orcid":false,"given":"Yongqi","family":"Sun","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4681-3851","authenticated-orcid":false,"given":"Chenguang","family":"Dai","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8320-4230","authenticated-orcid":false,"given":"Hanyun","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Longguang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7285-3026","authenticated-orcid":false,"given":"Wenke","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3732-962X","authenticated-orcid":false,"given":"Meilin","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yongsheng","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3332-9668","authenticated-orcid":false,"given":"Anzhu","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"5","key":"10.1016\/j.patcog.2026.113429_bib0001","doi-asserted-by":"crossref","first-page":"1951","DOI":"10.1109\/TITS.2019.2909066","article-title":"Automated evaluation of semantic segmentation robustness for autonomous driving","volume":"21","author":"Zhou","year":"2020","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.patcog.2026.113429_bib0002","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110959","article-title":"Multimodal self-supervised learning for remote sensing data land cover classification","volume":"157","author":"Xue","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113429_bib0003","unstructured":"J. Wu, W. Ji, Y. Liu, H. Fu, M. Xu, Y. Xu, Y. Jin, Medical SAM adapter: adapting segment anything model for medical image segmentation, arXiv: 2304.12620(2023)."},{"issue":"11","key":"10.1016\/j.patcog.2026.113429_bib0004","doi-asserted-by":"crossref","first-page":"21405","DOI":"10.1109\/TITS.2022.3177615","article-title":"SFNet-N: an improved SFNet algorithm for semantic segmentation of low-light autonomous driving road scenes","volume":"23","author":"Wang","year":"2022","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.patcog.2026.113429_bib0005","series-title":"Proceedings of the 28th ACM International Conference on Multimedia (ACM MM)","first-page":"2317","article-title":"Integrating semantic segmentation and retinex model for low-light image enhancement","author":"Fan","year":"2020"},{"key":"10.1016\/j.patcog.2026.113429_bib0006","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"707","article-title":"Model adaptation with synthetic and real data for semantic dense foggy scene understanding","author":"Sakaridis","year":"2018"},{"key":"10.1016\/j.patcog.2026.113429_bib0007","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"18900","article-title":"Both style and fog matter: cumulative domain adaptation for semantic foggy scene understanding","author":"Ma","year":"2022"},{"key":"10.1016\/j.patcog.2026.113429_bib0008","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)","first-page":"285","article-title":"Semantic segmentation for thermal images: a comparative survey","author":"K\u00fct\u00fck","year":"2022"},{"key":"10.1016\/j.patcog.2026.113429_bib0009","first-page":"3156","article-title":"RoadFormer+: delivering RGB-X scene parsing through scale-aware information decoupling and advanced heterogeneous feature fusion","author":"Huang","year":"2025","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.patcog.2026.113429_bib0010","series-title":"Proc. IEEE Int. Conf. Robot. Autom. (ICRA)","first-page":"9441","article-title":"PST900: RGB-thermal calibration, dataset and segmentation network","author":"Shivakumar","year":"2020"},{"key":"10.1016\/j.patcog.2026.113429_bib0011","doi-asserted-by":"crossref","first-page":"661","DOI":"10.1016\/j.isprsjprs.2025.01.022","article-title":"An interactive fusion attention-guided network for ground surface hot spring fluids segmentation in dual-spectrum UAV images","volume":"220","author":"Yi","year":"2025","journal-title":"ISPRS J. Photogramm. Remote Sens."},{"key":"10.1016\/j.patcog.2026.113429_bib0012","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"2633","article-title":"ABMDRNet: adaptive-weighted bi-directional modality difference reduction network for RGB-T semantic segmentation","author":"Zhang","year":"2021"},{"issue":"12","key":"10.1016\/j.patcog.2026.113429_bib0013","doi-asserted-by":"crossref","first-page":"14679","DOI":"10.1109\/TITS.2023.3300537","article-title":"CMX: cross-modal fusion for RGB-X semantic segmentation with transformers","volume":"24","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.patcog.2026.113429_bib0014","doi-asserted-by":"crossref","first-page":"6348","DOI":"10.1109\/TMM.2023.3349072","article-title":"Context-aware interaction network for RGB-T semantic segmentation","volume":"26","author":"Lv","year":"2024","journal-title":"IEEE Trans. Multimed."},{"issue":"2","key":"10.1016\/j.patcog.2026.113429_bib0015","doi-asserted-by":"crossref","first-page":"899","DOI":"10.1002\/mp.17481","article-title":"Plug-and-play segment anything model improves nnUNet performance","volume":"52","author":"Li","year":"2025","journal-title":"Med. Phys."},{"key":"10.1016\/j.patcog.2026.113429_bib0016","doi-asserted-by":"crossref","unstructured":"R. Sahay, A. Savakis, MoPEFT: a mixture-of-PEFTs for the segment anything model, arXiv: 2405.00293(2024).","DOI":"10.1109\/CVPRW67362.2025.00647"},{"key":"10.1016\/j.patcog.2026.113429_bib0017","series-title":"2024\u202fIEEE International Conference on Robotics and Automation (ICRA)","first-page":"9093","article-title":"SAM-event-adapter: adapting segment anything model for event-RGB semantic segmentation","author":"Yao","year":"2024"},{"key":"10.1016\/j.patcog.2026.113429_bib0018","doi-asserted-by":"crossref","unstructured":"J. Zhao, F. Teng, K. Luo, G. Zhao, Z. Li, X. Zheng, K. Yang, Unveiling the potential of segment anything model 2 for RGB-thermal semantic segmentation with language guidance, arXiv: 2503.02581(2025).","DOI":"10.1109\/IROS60139.2025.11247337"},{"key":"10.1016\/j.patcog.2026.113429_bib0019","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Every SAM drop counts: embracing semantic priors for multi-modality image fusion and beyond","author":"Wu","year":"2025"},{"key":"10.1016\/j.patcog.2026.113429_bib0020","series-title":"Proc. IEEE\/RSJ Int. Conf. Intell. Robots Syst. (IROS)","first-page":"5108","article-title":"MFNet: towards real-time semantic segmentation for autonomous vehicles with multi-spectral scenes","author":"Ha","year":"2017"},{"issue":"3","key":"10.1016\/j.patcog.2026.113429_bib0021","doi-asserted-by":"crossref","first-page":"2576","DOI":"10.1109\/LRA.2019.2904733","article-title":"RTFNet: RGB-thermal fusion network for semantic segmentation of urban scenes","volume":"4","author":"Sun","year":"2019","journal-title":"IEEE Rob. Autom."},{"issue":"3","key":"10.1016\/j.patcog.2026.113429_bib0022","doi-asserted-by":"crossref","first-page":"1000","DOI":"10.1109\/TASE.2020.2993143","article-title":"FuseSeg: semantic segmentation of urban scenes based on RGB and thermal data fusion","volume":"18","author":"Sun","year":"2021","journal-title":"IEEE Trans. Autom. Sci. Eng."},{"issue":"12","key":"10.1016\/j.patcog.2026.113429_bib0023","doi-asserted-by":"crossref","first-page":"7737","DOI":"10.1109\/TCSVT.2023.3281419","article-title":"SGFNet: semantic-guided fusion network for RGB-thermal semantic segmentation","volume":"33","author":"Wang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.patcog.2026.113429_bib0024","series-title":"Proc. IEEE\/RSJ Int. Conf. Intell. Robots Syst. (IROS)","first-page":"4467","article-title":"FEANet: feature-enhanced attention network for RGB-thermal real-time semantic segmentation","author":"Deng","year":"2021"},{"issue":"1","key":"10.1016\/j.patcog.2026.113429_bib0025","doi-asserted-by":"crossref","first-page":"657","DOI":"10.1109\/TITS.2023.3306368","article-title":"EGFNet: edge-aware guidance fusion network for RGB\u2013thermal urban scene parsing","volume":"25","author":"Dong","year":"2024","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.patcog.2026.113429_bib0026","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.127913","article-title":"Residual spatial fusion network for RGB-thermal semantic segmentation","volume":"595","author":"Li","year":"2024","journal-title":"Neurocomputing"},{"key":"10.1016\/j.patcog.2026.113429_bib0027","series-title":"IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","first-page":"1734","article-title":"Sigma: siamese mamba network for multi-modal semantic segmentation","author":"Wan","year":"2025"},{"key":"10.1016\/j.patcog.2026.113429_bib0028","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"8081","article-title":"Multi-interactive feature learning and a full-time multi-modality benchmark for image fusion and segmentation","author":"Liu","year":"2023"},{"key":"10.1016\/j.patcog.2026.113429_bib0029","series-title":"2024\u202fIEEE International Conference on Robotics and Automation (ICRA)","first-page":"11110","article-title":"Complementary random masking for RGB-thermal semantic segmentation","author":"Shin","year":"2024"},{"key":"10.1016\/j.patcog.2026.113429_bib0030","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"15979","article-title":"Masked autoencoders are scalable vision learners","author":"He","year":"2022"},{"key":"10.1016\/j.patcog.2026.113429_bib0031","series-title":"International Conference on Machine Learning (ICML)","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.113429_bib0032","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"3992","article-title":"Segment anything","author":"Kirillov","year":"2023"},{"key":"10.1016\/j.patcog.2026.113429_bib0033","series-title":"International Conference on Learning Representations (ICLR)","article-title":"DINOv2: learning robust visual features without supervision","author":"Oquab","year":"2025"},{"key":"10.1016\/j.patcog.2026.113429_bib0034","unstructured":"X. Zhao, W. Ding, Y. An, Y. Du, T. Yu, M. Li, M. Tang, J. Wang, Fast segment anything, arXiv: 2306.12156(2023)."},{"key":"10.1016\/j.patcog.2026.113429_bib0035","unstructured":"C. Zhang, D. Han, Y. Qiao, J.U. Kim, S.-H. Bae, S. Lee, C.S. Hong, Faster segment anything: Towards lightweight sam for mobile applications, arXiv: 2306.14289(2023)."},{"key":"10.1016\/j.patcog.2026.113429_bib0036","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"16111","article-title":"EfficientSAM: leveraged masked image pretraining for efficient segment anything","author":"Xiong","year":"2024"},{"key":"10.1016\/j.patcog.2026.113429_bib0037","series-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems (NeurIPS)","first-page":"29914","article-title":"Segment anything in high quality","author":"Ke","year":"2023"},{"key":"10.1016\/j.patcog.2026.113429_bib0038","series-title":"International Conference on Learning Representations (ICLR)","article-title":"SAM 2: segment anything in images and videos","author":"Ravi","year":"2025"},{"key":"10.1016\/j.patcog.2026.113429_bib0039","unstructured":"C. Zhu, B. Xiao, L. Shi, S. Xu, X. Zheng, Customize segment anything model for multi-modal semantic segmentation with mixture of LoRA experts, arXiv: 2412.04220(2024)."},{"key":"10.1016\/j.patcog.2026.113429_bib0040","series-title":"2025\u202fIEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","first-page":"4655","article-title":"SAM-Mamba: mamba guided SAM architecture for generalized zero-shot polyp segmentation","author":"Dutta","year":"2025"},{"key":"10.1016\/j.patcog.2026.113429_bib0041","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"4515","article-title":"SAM-parser: fine-tuning sam efficiently by parameter space reconstruction","author":"Peng","year":"2024"},{"key":"10.1016\/j.patcog.2026.113429_bib0042","doi-asserted-by":"crossref","unstructured":"K. Zhang, D. Liu, Customized segment anything model for medical image segmentation, arXiv: 2304.13785(2023).","DOI":"10.2139\/ssrn.4495221"},{"key":"10.1016\/j.patcog.2026.113429_bib0043","series-title":"International Conference on Learning Representations (ICLR)","article-title":"LoRA: low-rank adaptation of large language models","author":"Hu","year":"2022"},{"key":"10.1016\/j.patcog.2026.113429_bib0044","doi-asserted-by":"crossref","first-page":"1937","DOI":"10.1007\/s11063-018-09977-1","article-title":"Learning from imbalanced data sets with weighted cross-entropy function","volume":"50","author":"Aurelio","year":"2019","journal-title":"Neural Process. Lett."},{"key":"10.1016\/j.patcog.2026.113429_bib0045","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4413","article-title":"The Lov\u00e1sz-Softmax loss: a tractable surrogate for the optimization of the intersection-over-union measure in neural networks","author":"Berman","year":"2018"},{"key":"10.1016\/j.patcog.2026.113429_bib0046","series-title":"2016 Fourth International Conference on 3D Vision (3DV)","first-page":"565","article-title":"V-net: fully convolutional neural networks for volumetric medical image segmentation","author":"Milletari","year":"2016"},{"key":"10.1016\/j.patcog.2026.113429_bib0047","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"1136","article-title":"Delivering arbitrary-modal semantic segmentation","author":"Zhang","year":"2023"},{"key":"10.1016\/j.patcog.2026.113429_bib0048","doi-asserted-by":"crossref","first-page":"599","DOI":"10.1109\/OJSP.2024.3389812","article-title":"MMSFormer: multimodal transformer for material and semantic segmentation","volume":"5","author":"Reza","year":"2024","journal-title":"IEEE Open J. Signal Process."},{"key":"10.1016\/j.patcog.2026.113429_bib0049","doi-asserted-by":"crossref","first-page":"7790","DOI":"10.1109\/TIP.2021.3109518","article-title":"GMNet: graded-feature multilabel-learning network for RGB-thermal urban scene semantic segmentation","volume":"30","author":"Zhou","year":"2021","journal-title":"IEEE Trans. Image Process."},{"issue":"19","key":"10.1016\/j.patcog.2026.113429_bib0050","doi-asserted-by":"crossref","first-page":"3238","DOI":"10.3390\/electronics11193238","article-title":"LASNet: a light-weight asymmetric spatial feature network for real-time semantic segmentation","volume":"11","author":"Chen","year":"2022","journal-title":"Electronics"},{"key":"10.1016\/j.patcog.2026.113429_bib0051","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111801","article-title":"U3M: Unbiased multiscale modal fusion model for multimodal semantic segmentation","volume":"168","author":"Li","year":"2025","journal-title":"Pattern Recognit."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326003948?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326003948?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T16:58:05Z","timestamp":1779382685000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326003948"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,10]]},"references-count":51,"alternative-id":["S0031320326003948"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113429","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,10]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"P3D: Plug-and-play prompt-driven framework for RGB-thermal semantic segmentation","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113429","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Author(s). Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"113429"}}