{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:06:20Z","timestamp":1777889180414,"version":"3.51.4"},"reference-count":72,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00740","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"7894-7904","source":"Crossref","is-referenced-by-count":0,"title":["AdvDreamer Unveils: Are Vision-Language Models Truly Ready for Real-World 3D Variations?"],"prefix":"10.1109","author":[{"given":"Shouwei","family":"Ruan","sequence":"first","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University,State Key Laboratory of Virtual Reality Technology and Systems"}]},{"given":"Hanqing","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University,State Key Laboratory of Virtual Reality Technology and Systems"}]},{"given":"Yao","family":"Huang","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University,State Key Laboratory of Virtual Reality Technology and Systems"}]},{"given":"Xiaoqi","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University,State Key Laboratory of Virtual Reality Technology and Systems"}]},{"given":"Caixin","family":"Kang","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University,State Key Laboratory of Virtual Reality Technology and Systems"}]},{"given":"Hang","family":"Su","sequence":"additional","affiliation":[{"name":"Institute for AI, Tsinghua University,Tsinghua-Bosch Joint ML Center, THBI Lab, BNRist Center,Dept. of Comp. Sci. and Tech."}]},{"given":"Yinpeng","family":"Dong","sequence":"additional","affiliation":[{"name":"College of AI, Tsinghua University"}]},{"given":"Xingxing","family":"Wei","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Beihang University,State Key Laboratory of Virtual Reality Technology and Systems"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2620"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00498"},{"key":"ref5","article-title":"Anthropic","volume-title":"claude-3","year":"2024"},{"key":"ref6","first-page":"284","article-title":"Synthesizing robust adversarial examples","volume-title":"International conference on machine learning","author":"Athalye","year":"2018"},{"key":"ref7","article-title":"Qwen-vl: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref8","article-title":"Objectnet: A large-scale bias-controlled dataset for pushing the limits of object recognition models","author":"Barbu","year":"2019","journal-title":"Advances in neural information processing systems, 32"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.52202\/075280-3072"},{"key":"ref11","article-title":"Microsoft coco captions: Data collection and evaluation server","author":"Chen","year":"2015","journal-title":"arXiv preprint arXiv"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00630"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"ref15","article-title":"Diffedit: Diffusion-based semantic image editing with mask guidance","volume-title":"ICLR 2023 (Eleventh International Conference on Learning Representations)","author":"Couairon"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02325"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2666"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098043"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-66415-2_2"},{"key":"ref22","article-title":"The cma evolution strategy: A tutorial","author":"Hansen","year":"2016","journal-title":"arXiv preprint arXiv"},{"key":"ref23","volume-title":"Openlrm: Open-source large reconstruction models","author":"He","year":"2023"},{"key":"ref24","article-title":"Lrm: Large reconstruction model for single image to 3d","author":"Hong","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref25","article-title":"Voxposer: Composable 3d value maps for robotic manipulation with language models","author":"Huang","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref26","volume-title":"openclip, 2021","author":"Ilharco"},{"key":"ref27","article-title":"LAION-AI","volume-title":"Clip-benchmark","year":"2024"},{"key":"ref28","article-title":"Mimicit: Multi-modal in-context instruction tuning","author":"Li","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref29","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref30","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International conference on machine learning","author":"Li","year":"2022"},{"key":"ref31","article-title":"Blip-2 : Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning, pages 1973019742. PMLR","author":"Li","year":"2023"},{"key":"ref32","article-title":"Visualbert: A simple and performant baseline for vision and language","author":"Li","journal-title":"arXiv preprint arXiv"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref35","volume-title":"Llava-next: Improved reasoning, ocr, and world knowledge","author":"Liu","year":"2024"},{"key":"ref36","article-title":"Visual instruction tuning","author":"Liu","year":"2024","journal-title":"Advances in neural information processing systems, 36"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72983-6_8"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i6.32616"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3755188"},{"key":"ref41","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"Lu","year":"2019","journal-title":"Advances in neural information processing systems, 32"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-021-00437-5"},{"key":"ref43","article-title":"Understanding zero-shot adversarial robustness for large-scale models","author":"Mao","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"ref45","article-title":"openai","volume-title":"Gpt-4o","year":"2024"},{"key":"ref46","article-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref47","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"key":"ref48","article-title":"Grounded sam: Assembling open-world models for diverse visual tasks","author":"Ren","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00434"},{"key":"ref51","article-title":"Improving viewpoint robustness for visual recognition via adversarial training","author":"Ruan","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_18"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1833"},{"key":"ref54","article-title":"Denoising diffusion implicit models","author":"Song","journal-title":"arXiv preprint arXiv"},{"key":"ref55","article-title":"Eva-clip: Improved training techniques for clip at scale","author":"Sun","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref56","article-title":"Drivevlm: The convergence of autonomous driving and large vision-language models","author":"Tian","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref57","article-title":"Triposr: Fast 3d object reconstruction from a single image","author":"Tochilkin","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00914"},{"key":"ref59","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref60","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref61","article-title":"Cogvlm: Visual expert for pretrained language models","author":"Wang","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.5040\/9798881817916.ch-004"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3440097"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610712"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref66","article-title":"Scaling in-the-wild training for diffusion-based illumination harmonization and editing by imposing consistent light transport","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Zhang","year":"2025"},{"key":"ref67","volume-title":"Benchmarking trustworthiness of multimodal large language models: A comprehensive study","author":"Zhang","year":"2024"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73220-1_16"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_10"},{"key":"ref70","article-title":"On evaluating adversarial robustness of large vision-language models","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Zhao","year":"2023"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2020"},{"key":"ref72","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint arXiv"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445309.pdf?arnumber=11445309","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:07:58Z","timestamp":1777612078000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445309\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":72,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00740","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}