{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T12:22:38Z","timestamp":1777897358634,"version":"3.51.4"},"reference-count":55,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100013139","name":"Humanities and Social Science Fund of Ministry of Education of China","doi-asserted-by":"publisher","award":["24YJCZH416"],"award-info":[{"award-number":["24YJCZH416"]}],"id":[{"id":"10.13039\/501100013139","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62403076"],"award-info":[{"award-number":["62403076"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["52472399"],"award-info":[{"award-number":["52472399"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.eswa.2026.132012","type":"journal-article","created":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T16:11:40Z","timestamp":1773504700000},"page":"132012","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Adaptive dual cross-attention network for multispectral object detection in autonomous driving"],"prefix":"10.1016","volume":"318","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3457-1982","authenticated-orcid":false,"given":"Jinlai","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Xiaolong","family":"Song","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8257-2559","authenticated-orcid":false,"given":"Yucheng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Diqing","family":"Liang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8970-1086","authenticated-orcid":false,"given":"Zhiyong","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jinhu","family":"Cai","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.eswa.2026.132012_bib0001","article-title":"Lightweight and computationally faster hypermetropic convolutional neural network for small size object detection","volume":"119","author":"Amudhan","year":"2022","journal-title":"Image and Vision Computing"},{"key":"10.1016\/j.eswa.2026.132012_bib0002","doi-asserted-by":"crossref","first-page":"2562","DOI":"10.1109\/LSP.2022.3229571","article-title":"Effectiveness guided cross-modal information sharing for aligned RGB-t object detection","volume":"29","author":"An","year":"2022","journal-title":"IEEE Signal Processing Letters"},{"key":"10.1016\/j.eswa.2026.132012_bib0003","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"1090","article-title":"Transfusion: Robust lidar-camera fusion for 3d object detection with transformers","author":"Bai","year":"2022"},{"key":"10.1016\/j.eswa.2026.132012_bib0004","unstructured":"Bochkovskiy, A., Wang, C.-Y., & Liao, H.-Y. M. (2020). Yolov4: Optimal speed and accuracy of object detection. arXiv: 2004.10934."},{"key":"10.1016\/j.eswa.2026.132012_bib0005","series-title":"Proceedings of COMPSTAT\u20192010: 19th international conference on computational statisticsparis france, august 22-27, 2010 keynote, invited and contributed papers","first-page":"177","article-title":"Large-scale machine learning with stochastic gradient descent","author":"Bottou","year":"2010"},{"key":"10.1016\/j.eswa.2026.132012_bib0006","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"403","article-title":"Multimodal object detection by channel switching and spatial attention","author":"Cao","year":"2023"},{"key":"10.1016\/j.eswa.2026.132012_bib0007","series-title":"European conference on computer vision","first-page":"139","article-title":"Multimodal object detection via probabilistic ensembling","author":"Chen","year":"2022"},{"issue":"4","key":"10.1016\/j.eswa.2026.132012_bib0008","doi-asserted-by":"crossref","first-page":"643","DOI":"10.1016\/j.compeleceng.2008.11.011","article-title":"Multi-spectral fusion for surveillance systems","volume":"36","author":"Denman","year":"2010","journal-title":"Computers & Electrical Engineering"},{"key":"10.1016\/j.eswa.2026.132012_bib0009","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"1022","article-title":"Benchmarking robustness of 3d object detection to common corruptions","author":"Dong","year":"2023"},{"key":"10.1016\/j.eswa.2026.132012_bib0011","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR)","article-title":"Rich feature hierarchies for accurate object detection and semantic segmentation","author":"Girshick","year":"2014"},{"key":"10.1016\/j.eswa.2026.132012_bib0012","series-title":"2016 IEEE conference on computer vision and pattern recognition (CVPR)","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.eswa.2026.132012_bib0013","unstructured":"Hendrycks, D., & Dietterich, T. (2019). Benchmarking neural network robustness to common corruptions and perturbations. arXiv: 1903.12261."},{"key":"10.1016\/j.eswa.2026.132012_bib0014","doi-asserted-by":"crossref","DOI":"10.1016\/j.ins.2025.121958","article-title":"Enhanced grey wolf optimizer with hybrid strategies for efficient feature selection in high-dimensional data","volume":"705","author":"Huang","year":"2025","journal-title":"Information Sciences"},{"key":"10.1016\/j.eswa.2026.132012_bib0015","series-title":"2015 IEEE conference on computer vision and pattern recognition (CVPR)","first-page":"1037","article-title":"Multispectral pedestrian detection: Benchmark dataset and baseline","author":"Hwang","year":"2015"},{"key":"10.1016\/j.eswa.2026.132012_bib0016","series-title":"2015 IEEE conference on computer vision and pattern recognition (CVPR)","first-page":"1037","article-title":"Multispectral pedestrian detection: Benchmark dataset and baseline","author":"Hwang","year":"2015"},{"key":"10.1016\/j.eswa.2026.132012_bib0017","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"3496","article-title":"Llvip: A visible-infrared paired dataset for low-light vision","author":"Jia","year":"2021"},{"key":"10.1016\/j.eswa.2026.132012_sbref0018","series-title":"ultralytics\/yolov5: v3.1 - bug fixes and performance improvements","author":"Jocher","year":"2020"},{"issue":"4","key":"10.1016\/j.eswa.2026.132012_bib0019","doi-asserted-by":"crossref","first-page":"7846","DOI":"10.1109\/LRA.2021.3099870","article-title":"Mlpd: Multi-label pedestrian detector in multispectral domain","volume":"6","author":"Kim","year":"2021","journal-title":"IEEE Robotics and Automation Letters"},{"issue":"6","key":"10.1016\/j.eswa.2026.132012_bib0020","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1145\/3065386","article-title":"Imagenet classification with deep convolutional neural networks","volume":"60","author":"Krizhevsky","year":"2017","journal-title":"Communications ACM"},{"key":"10.1016\/j.eswa.2026.132012_bib0021","unstructured":"Li, C., Song, D., Tong, R., & Tang, M. (2018). Multispectral pedestrian detection via simultaneous detection and segmentation. arXiv: 1808.04818."},{"key":"10.1016\/j.eswa.2026.132012_bib0022","doi-asserted-by":"crossref","first-page":"161","DOI":"10.1016\/j.patcog.2018.08.005","article-title":"Illumination-aware faster r-CNN for robust multispectral pedestrian detection","volume":"85","author":"Li","year":"2019","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.eswa.2026.132012_bib0023","unstructured":"Liu, J., Zhang, S., Wang, S., & Metaxas, D. N. (2016a). Multispectral deep neural networks for pedestrian detection. arXiv: 1611.02644."},{"key":"10.1016\/j.eswa.2026.132012_bib0024","series-title":"Computer vision\u2013ECCV 2016: 14th european conference, amsterdam, the netherlands, october 11\u201314, 2016, proceedings, part i 14","first-page":"21","article-title":"Ssd: Single shot multibox detector","author":"Liu","year":"2016"},{"key":"10.1016\/j.eswa.2026.132012_bib0025","doi-asserted-by":"crossref","first-page":"143","DOI":"10.1016\/j.patcog.2018.03.007","article-title":"Unified multi-spectral pedestrian detection based on probabilistic fusion networks","volume":"80","author":"Park","year":"2018","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.eswa.2026.132012_bib0026","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2026.132012_bib0027","unstructured":"Qingyun, F., Dapeng, H., & Zhaokui, W. (2021). Cross-modality fusion transformer for multispectral object detection. arXiv: 2111:00273."},{"key":"10.1016\/j.eswa.2026.132012_bib0028","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"779","article-title":"You only look once: Unified, real-time object detection","author":"Redmon","year":"2016"},{"issue":"6","key":"10.1016\/j.eswa.2026.132012_bib0029","doi-asserted-by":"crossref","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","article-title":"Faster r-CNN: Towards real-time object detection with region proposal networks","volume":"39","author":"Ren","year":"2016","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.132012_bib0030","series-title":"2019 IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","first-page":"658","article-title":"Generalized intersection over union: A metric and a loss for bounding box regression","author":"Rezatofighi","year":"2019"},{"key":"10.1016\/j.eswa.2026.132012_bib0031","doi-asserted-by":"crossref","first-page":"1497","DOI":"10.1109\/JSTARS.2020.3041316","article-title":"Yolors: Object detection in multimodal remote sensing imagery","volume":"14","author":"Sharma","year":"2020","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"10.1016\/j.eswa.2026.132012_bib0032","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109913","article-title":"Icafusion: Iterative cross-attention guided feature fusion for multispectral object detection","volume":"145","author":"Shen","year":"2024","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.eswa.2026.132012_bib0033","first-page":"1","article-title":"Misaligned visible-thermal object detection: A drone-based benchmark and baseline","author":"Song","year":"2024","journal-title":"IEEE Transactions on Intelligent Vehicles"},{"issue":"10","key":"10.1016\/j.eswa.2026.132012_bib0034","doi-asserted-by":"crossref","first-page":"6700","DOI":"10.1109\/TCSVT.2022.3168279","article-title":"Drone-based RGB-infrared cross-modality vehicle detection via uncertainty-aware learning","volume":"32","author":"Sun","year":"2022","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.eswa.2026.132012_bib0035","series-title":"Proceedings of the on thematic workshops of ACM multimedia 2017","first-page":"35","article-title":"Multispectral object detection for autonomous vehicles","author":"Takumi","year":"2017"},{"key":"10.1016\/j.eswa.2026.132012_bib0010","unstructured":"F. Team, et al. FLIR ADAS Dataset (Aligned Version). 2025https:\/\/www.flir.cn\/oem\/adas\/adas-dataset-form\/. Accessed July 6, 2021."},{"key":"10.1016\/j.eswa.2026.132012_bib0036","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2020.103977","article-title":"Multimodal facial biometrics recognition: Dual-stream convolutional neural networks with multi-feature fusion layers","volume":"102","author":"Tiong","year":"2020","journal-title":"Image and Vision Computing"},{"key":"10.1016\/j.eswa.2026.132012_bib0037","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2026.132012_bib0038","series-title":"2021 17th international conference on machine vision and applications (MVA)","first-page":"1","article-title":"Multi-modal pedestrian detection with large misalignment based on modal-wise regression and multi-modal iou","author":"Wanchaitanawong","year":"2021"},{"key":"10.1016\/j.eswa.2026.132012_bib0039","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2025.112919","article-title":"Safety assessment of intelligent vehicles considering drivers\u2019 risk perception information under interval 2-tuple q-rung orthopair fuzzy sets","volume":"175","author":"Wang","year":"2025","journal-title":"Applied Soft Computing"},{"key":"10.1016\/j.eswa.2026.132012_bib0040","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"5541","article-title":"Gm-detr: Generalized muiltispectral detection transformer with efficient fusion encoder for visible-infrared detection","author":"Xiao","year":"2024"},{"key":"10.1016\/j.eswa.2026.132012_bib0041","series-title":"2022 international conference on robotics and automation (ICRA)","first-page":"2920","article-title":"Baanet: Learning bi-directional adaptive attention gates for multispectral pedestrian detection","author":"Yang","year":"2022"},{"key":"10.1016\/j.eswa.2026.132012_bib0042","doi-asserted-by":"crossref","first-page":"1172","DOI":"10.1109\/LSP.2023.3309578","article-title":"Multi-scale aggregation transformers for multispectral object detection","volume":"30","author":"You","year":"2023","journal-title":"IEEE Signal Processing Letters"},{"key":"10.1016\/j.eswa.2026.132012_bib0043","series-title":"European conference on computer vision","first-page":"509","article-title":"Translation, scale and rotation: cross-modal alignment meets RGB-infrared vehicle detection","author":"Yuan","year":"2022"},{"key":"10.1016\/j.eswa.2026.132012_bib0044","series-title":"2020 IEEE international conference on image processing (ICIP)","first-page":"276","article-title":"Multispectral fusion for object detection with cyclic fuse-and-refine blocks","author":"Zhang","year":"2020"},{"key":"10.1016\/j.eswa.2026.132012_bib0045","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"72","article-title":"Guided attentive feature fusion for multispectral pedestrian detection","author":"Zhang","year":"2021"},{"issue":"12","key":"10.1016\/j.eswa.2026.132012_bib0046","doi-asserted-by":"crossref","first-page":"14679","DOI":"10.1109\/TITS.2023.3300537","article-title":"Cmx: Cross-modal fusion for rgb-x semantic segmentation with transformers","volume":"24","author":"Zhang","year":"2023","journal-title":"IEEE Transactions on intelligent transportation systems"},{"key":"10.1016\/j.eswa.2026.132012_bib0047","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1016\/j.inffus.2018.09.015","article-title":"Cross-modality interactive attention network for multispectral pedestrian detection","volume":"50","author":"Zhang","year":"2019","journal-title":"Information Fusion"},{"key":"10.1016\/j.eswa.2026.132012_bib0048","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"5127","article-title":"Weakly aligned cross-modal learning for multispectral pedestrian detection","author":"Zhang","year":"2019"},{"key":"10.1016\/j.eswa.2026.132012_bib0049","article-title":"Enhanced hermit crabs detection using super-resolution reconstruction and improved YOLOv8 on UAV-captured imagery","author":"Zhao","year":"2025","journal-title":"Marine Environmental Research"},{"key":"10.1016\/j.eswa.2026.132012_bib0050","doi-asserted-by":"crossref","DOI":"10.1016\/j.atech.2024.100730","article-title":"Smart UAV-assisted rose growth monitoring with improved YOLOv10 and mamba restoration techniques","volume":"10","author":"Zhao","year":"2025","journal-title":"Smart Agricultural Technology"},{"key":"10.1016\/j.eswa.2026.132012_bib0051","doi-asserted-by":"crossref","DOI":"10.1016\/j.ecoinf.2025.103324","article-title":"Mamba-based super-resolution and semi-supervised YOLOv10 for freshwater mussel detection using acoustic video camera: A case study at lake izunuma, japan","author":"Zhao","year":"2025","journal-title":"Ecological Informatics"},{"issue":"4","key":"10.1016\/j.eswa.2026.132012_bib0052","doi-asserted-by":"crossref","first-page":"2125","DOI":"10.1109\/TCSVT.2023.3301933","article-title":"Toward unified token learning for vision-language tracking","volume":"34","author":"Zheng","year":"2023","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.eswa.2026.132012_bib0053","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"10635","article-title":"Decoupled spatio-temporal consistency learning for self-supervised tracking","volume":"vol. 39","author":"Zheng","year":"2025"},{"key":"10.1016\/j.eswa.2026.132012_bib0054","article-title":"Towards universal modal tracking with online dense temporal token learning","author":"Zheng","year":"2025","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.132012_bib0055","series-title":"Computer vision\u2013ECCV 2020: 16th european conference, glasgow, UK, august 23\u201328, 2020, proceedings, part XVIII 16","first-page":"787","article-title":"Improving multispectral pedestrian detection by addressing modality imbalance problems","author":"Zhou","year":"2020"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426009255?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426009255?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T12:10:46Z","timestamp":1776168646000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417426009255"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":55,"alternative-id":["S0957417426009255"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132012","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Adaptive dual cross-attention network for multispectral object detection in autonomous driving","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132012","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"132012"}}