{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T16:50:44Z","timestamp":1779295844224,"version":"3.51.4"},"reference-count":92,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,10,1]]},"DOI":"10.1109\/iccv51070.2023.01987","type":"proceedings-article","created":{"date-parts":[[2024,1,15]],"date-time":"2024-01-15T15:55:59Z","timestamp":1705334159000},"page":"21684-21695","source":"Crossref","is-referenced-by-count":143,"title":["DDP: Diffusion Model for Dense Visual Prediction"],"prefix":"10.1109","author":[{"given":"Yuanfeng","family":"Ji","sequence":"first","affiliation":[{"name":"The University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhe","family":"Chen","sequence":"additional","affiliation":[{"name":"Nanjing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Enze","family":"Xie","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lanqing","family":"Hong","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xihui","family":"Liu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhaoqiang","family":"Liu","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tong","family":"Lu","sequence":"additional","affiliation":[{"name":"Nanjing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenguo","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ping","family":"Luo","sequence":"additional","affiliation":[{"name":"The University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Segdiff: Image segmentation with diffusion probabilistic models","author":"Amit","year":"2021"},{"key":"ref2","article-title":"Protein structure and sequence generation with equivariant denoising diffusion probabilistic models","author":"Anand","year":"2022"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00400"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00330"},{"key":"ref5","article-title":"Efficient self-ensemble framework for semantic segmentation","author":"Bousselham","year":"2021"},{"key":"ref6","article-title":"Big-gan: Large scale gan training for high fidelity natural image synthesis","volume-title":"ICLR","author":"Brock"},{"key":"ref7","first-page":"11621","article-title":"nuscenes: A mul-timodal dataset for autonomous driving","volume-title":"CVPR","author":"Caesar"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1802.02611"},{"key":"ref10","article-title":"Sampling is as easy as learning the score: theory for diffusion models with minimal data assumptions","author":"Chen","year":"2022"},{"key":"ref11","article-title":"Diffu-siondet: Diffusion model for object detection","author":"Chen","year":"2022"},{"key":"ref12","article-title":"On the importance of noise scheduling for diffusion models","author":"Chen","year":"2023"},{"key":"ref13","article-title":"A generalist framework for panop-tic segmentation of images and videos","author":"Chen","year":"2022"},{"key":"ref14","article-title":"Analog bits: Generating discrete data using diffusion models with self-conditioning","author":"Chen","year":"2022"},{"key":"ref15","article-title":"Analog bits: Generating discrete data using diffusion models with self-conditioning","volume-title":"ICLR","author":"Chen"},{"key":"ref16","article-title":"Vision transformer adapter for dense predictions","volume-title":"ICLR","author":"Chen"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"ref18","first-page":"17864","article-title":"Per-pixel classification is not all you need for semantic segmentation","volume-title":"NeurIPS","volume":"34","author":"Cheng"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref20","article-title":"Diffdock: Diffusion steps, twists, and turns for molecular docking","author":"Corso","year":"2022"},{"key":"ref21","article-title":"Region rebalance for long-tailed semantic segmentation","author":"Cui","year":"2022"},{"key":"ref22","article-title":"Consistent diffusion models: Mitigating sampling drift by learning to be consistent","author":"Daras","year":"2023"},{"key":"ref23","article-title":"Multiresolution textual inversion","author":"Daras","year":"2022"},{"key":"ref24","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume-title":"NeurIPS","volume":"34","author":"Dhariwal"},{"key":"ref25","article-title":"Depth map prediction from a single image using a multi-scale deep network","volume-title":"NeurIPS","author":"Eigen"},{"key":"ref26","article-title":"Flownet: Learning optical flow with convolutional networks","author":"Fischer","year":"2015"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00214"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913491297"},{"key":"ref29","article-title":"Vision transformers with patch diversification","author":"Gong","year":"2021"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196544"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.300"},{"key":"ref32","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"NeurIPS","volume":"33","author":"Ho"},{"key":"ref33","article-title":"Video diffusion models","author":"Ho","year":"2022"},{"key":"ref34","article-title":"Cogvideo: Large-scale pretraining for text-to-video generation via transformers","author":"Hong","year":"2022"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00292"},{"key":"ref37","article-title":"Progressive growing of gans for improved quality, stability, and variation","volume-title":"ICLR","author":"Karras"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_29"},{"key":"ref39","article-title":"From big to small: Multi-scale local planar guidance for monocular depth estimation","author":"Lee","year":"2019"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00820"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-023-1458-0"},{"key":"ref42","article-title":"Binsformer: Revisiting adaptive bins for monocular depth estimation","author":"Li","year":"2022"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3252807"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160968"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2974682"},{"key":"ref49","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017"},{"key":"ref50","article-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models","author":"Nichol","year":"2021"},{"key":"ref51","first-page":"8162","article-title":"Improved denoising diffusion probabilistic models","volume-title":"ICML","author":"Nichol"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_12"},{"key":"ref53","article-title":"Hierarchical text-conditional image generation with clip latents","author":"Ramesh","year":"2022"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"ref55","article-title":"Orthographic feature transform for monocular 3d object detection","author":"Roddick","year":"2018"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref58","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","author":"Saharia","year":"2022"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3204461"},{"key":"ref60","first-page":"2234","article-title":"Improved techniques for training gans","volume-title":"NeurIPS","volume":"29","author":"Salimans"},{"key":"ref61","article-title":"Monocular depth estimation using diffusion models","author":"Saxena","year":"2023"},{"key":"ref62","article-title":"Structure-based drug design with equivariant diffusion models","author":"Schneuing","year":"2022"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"ref64","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","volume-title":"ICML","author":"Sohl-Dickstein"},{"key":"ref65","article-title":"Denoising diffusion implicit models","author":"Song","year":"2020"},{"key":"ref66","article-title":"Denoising diffusion implicit models","volume-title":"ICLR","author":"Song"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298655"},{"key":"ref68","first-page":"12438","article-title":"Improved techniques for training score-based generative models","volume-title":"NeurIPS","volume":"33","author":"Song"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"ref70","article-title":"Diffusion probabilistic modeling of protein backbones in 3d for the motif-scaffolding problem","author":"Trippe","year":"2022"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00466"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref74","first-page":"1336","article-title":"Diffusion models for implicit image segmentation ensembles","volume-title":"MIDL","author":"Wolleb"},{"key":"ref75","article-title":"Medsegdiff: Medical image segmentation with diffusion probabilistic model","author":"Wu","year":"2022"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.153"},{"key":"ref78","article-title":"Segformer: Simple and efficient design for semantic segmentation with transformers","volume-title":"NeurIPS","volume":"34","author":"Xie"},{"key":"ref79","article-title":"M2\u0302 bev: Multi-camera joint 3d detection and segmentation with unified birds-eye view representation","author":"Xie","year":"2022"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01596"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.728"},{"key":"ref82","first-page":"16494","article-title":"Multi-modal virtual point 3d detection","volume-title":"NeurIPS","volume":"34","author":"Yin"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00578"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1909.11065"},{"key":"ref85","article-title":"Hrformer: High-resolution vision transformer for dense prediction","volume-title":"NeurIPS","volume":"34","author":"Yuan"},{"key":"ref86","article-title":"Dino: Detr with improved denoising anchor boxes for end-to-end object detection","author":"Zhang","year":"2022"},{"key":"ref87","first-page":"10326","article-title":"K-net: Towards unified image segmentation","volume-title":"NeurIPS","author":"Zhang"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.660"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01339"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"},{"key":"ref92","author":"Zhu","year":"2020","journal-title":"Deformable detr: Deformable transformers for end-to-end object detection"}],"event":{"name":"2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Paris, France","start":{"date-parts":[[2023,10,1]]},"end":{"date-parts":[[2023,10,6]]}},"container-title":["2023 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10376473\/10376477\/10376747.pdf?arnumber=10376747","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,17]],"date-time":"2024-01-17T20:02:18Z","timestamp":1705521738000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10376747\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,1]]},"references-count":92,"URL":"https:\/\/doi.org\/10.1109\/iccv51070.2023.01987","relation":{},"subject":[],"published":{"date-parts":[[2023,10,1]]}}}