{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T14:17:26Z","timestamp":1779200246711,"version":"3.51.4"},"reference-count":44,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02595","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"27958-27967","source":"Crossref","is-referenced-by-count":1,"title":["Fusion Meets Diverse Conditions: A High-Diversity Benchmark and Baseline for UAV-Based Multimodal Object Detection with Condition Cues"],"prefix":"10.1109","author":[{"given":"Chen","family":"Chen","sequence":"first","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kangcheng","family":"Bin","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ting","family":"Hu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiahao","family":"Qi","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingyue","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianpeng","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhen","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongxiang","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ping","family":"Zhong","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"9595","article-title":"Presenceonly geographical priors for fine-grained image classification","author":"Mac Aodha","year":"2019","journal-title":"ICCV"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-35289-8_25"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196845"},{"key":"ref4","article-title":"Condition-aware multimodal fusion for robust semantic perception of driving scenes","author":"Broedermann","year":"2024","journal-title":"arXiv"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02534"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.110423"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3222805"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00033"},{"key":"ref9","first-page":"2849","article-title":"Learning roi transformer for oriented object detection in aerial","author":"Ding","year":"2019","journal-title":"CVPR"},{"key":"ref10","article-title":"An image is worth $16\\times 16$ words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021","journal-title":"ICLR"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2018.11.017"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00281"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3062048"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612651"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.aei.2024.102953"},{"key":"ref17","volume-title":"ultralytics\/yolov5","author":"Jocher","year":"2020"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2018.08.005"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.5244\/C.30.73"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3060162"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3487780"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/j.jag.2023.103292"},{"key":"ref24","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2016.09.015"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CISP-BMEI48845.2019.8965948"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.3041316"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3168279"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.3390\/rs15030660"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00350"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01067"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/tgrs.2024.3376819"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_30"},{"key":"ref36","article-title":"Central moment discrepancy (CMD) for domain-invariant representation learning","author":"Zellinger","year":"2017","journal-title":"ICLR"},{"key":"ref37","first-page":"41753","article-title":"Provable dynamic fusion for low-quality multimodal data","author":"Zhang","year":"2023","journal-title":"ICML"},{"key":"ref38","article-title":"Multimodal fusion on low-quality data: A comprehensive survey","author":"Zhang","year":"2024","journal-title":"arXiv"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3268118"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TIM.2023.3251414"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/34.888718"},{"key":"ref42","article-title":"Image fusion via vision-language model","author":"Zhao","year":"2024","journal-title":"ICML"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i9.26346"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_46"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445711.pdf?arnumber=11445711","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:19:29Z","timestamp":1777529969000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445711\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02595","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}