{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T09:40:09Z","timestamp":1766050809765,"version":"3.38.0"},"reference-count":53,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T00:00:00Z","timestamp":1743465600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T00:00:00Z","timestamp":1743465600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,1]],"date-time":"2025-04-01T00:00:00Z","timestamp":1743465600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1109\/tpami.2025.3528247","type":"journal-article","created":{"date-parts":[[2025,1,10]],"date-time":"2025-01-10T20:26:10Z","timestamp":1736540770000},"page":"3018-3030","source":"Crossref","is-referenced-by-count":3,"title":["DiffTF++: 3D-Aware Diffusion Transformer for Large-Vocabulary 3D Generation"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5682-9446","authenticated-orcid":false,"given":"Ziang","family":"Cao","sequence":"first","affiliation":[{"name":"College of Computing and Data Science, S-Lab, Nanyang Technological University, Singapore"}]},{"given":"Fangzhou","family":"Hong","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, S-Lab, Nanyang Technological University, Singapore"}]},{"given":"Tong","family":"Wu","sequence":"additional","affiliation":[{"name":"Graduate Division of Information Engineering, The Chinese University of Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1821-4296","authenticated-orcid":false,"given":"Liang","family":"Pan","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligent Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4220-5958","authenticated-orcid":false,"given":"Ziwei","family":"Liu","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, S-Lab, Nanyang Technological University, Singapore"}]}],"member":"263","reference":[{"article-title":"DreamFusion: Text-to-3D using 2D diffusion","year":"2022","author":"Poole","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01214"},{"key":"ref4","first-page":"8406","article-title":"ProlificDreamer: High-fidelity and diverse text-to-3D generation with variational score distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00254"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.00421"},{"key":"ref7","first-page":"40","article-title":"Learning representations and generative models for 3D point clouds","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Achlioptas"},{"article-title":"Point-E: A system for generating 3D point clouds from complex prompts","year":"2022","author":"Nichol","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01565"},{"article-title":"Dit-3D: Exploring plain diffusion transformers for 3D shape generation","year":"2023","author":"Mo","key":"ref10"},{"article-title":"ShapeNet: An information-rich 3D model repository","year":"2015","author":"Chang","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00084"},{"article-title":"Large-vocabulary 3D diffusion model with transformer","year":"2023","author":"Cao","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref16","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Touvron"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","year":"2020","author":"Zhu","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01517"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01438"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3307174"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"ref24","first-page":"4797","article-title":"Conditional image generation with pixelCNN decoders","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Van den Oord"},{"key":"ref25","first-page":"14745","article-title":"TransGAN: Two pure transformers can make one strong GAN, and that can scale up","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jiang"},{"key":"ref26","first-page":"1691","article-title":"Generative pretraining from pixels","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"article-title":"Generating long sequences with sparse transformers","year":"2019","author":"Child","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"article-title":"Scalable diffusion models with transformers","year":"2022","author":"Peebles","key":"ref29"},{"key":"ref30","first-page":"31841","article-title":"Get3D: A generative model of high quality 3D textured shapes learned from images","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gao"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20062-5_5"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20893-6_7"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00574"},{"article-title":"Stylenerf: A style-based 3d-aware generator for high-resolution image synthesis","year":"2021","author":"Gu","key":"ref34"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01129"},{"key":"ref36","first-page":"20154","article-title":"GRAF: Generative radiance fields for 3d-aware image synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Schwarz"},{"article-title":"Cips-3D: A 3D-aware generator of GANs based on conditionally-independent pixel synthesis","year":"2021","author":"Zhou","key":"ref37"},{"article-title":"3DDesigner: Towards photorealistic 3D object generation and editing with text-guided diffusion models","year":"2022","author":"Li","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00286"},{"article-title":"Lion: Latent point diffusion models for 3D shape generation","year":"2022","author":"Zeng","key":"ref40"},{"article-title":"3D neural field generation using triplane diffusion","year":"2022","author":"Shue","key":"ref41"},{"article-title":"Rodin: A generative model for sculpting 3D digital avatars using diffusion","year":"2022","author":"Wang","key":"ref42"},{"key":"ref43","first-page":"11808","article-title":"NerfDiff: Single-image view synthesis with NERF-guided distillation from 3D-aware diffusion","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Gu"},{"article-title":"LRM: Large reconstruction model for single image to 3D","year":"2023","author":"Hong","key":"ref44"},{"article-title":"DMV3D: Denoising multi-view diffusion using 3D large reconstruction model","year":"2023","author":"Xu","key":"ref45"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01882"},{"key":"ref47","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ho"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1002\/jemt.20294"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/2945.468400"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","year":"2020","author":"Dosovitskiy","key":"ref50"},{"article-title":"Denoising diffusion implicit models","year":"2020","author":"Song","key":"ref51"},{"key":"ref52","first-page":"6629","article-title":"GANs trained by a two time-scale update rule converge to a local NASH equilibrium","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Heusel"},{"article-title":"Demystifying MMD GANs","year":"2018","author":"Bi\u0144kowski","key":"ref53"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10916529\/10836771.pdf?arnumber=10836771","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,7]],"date-time":"2025-03-07T18:41:04Z","timestamp":1741372864000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10836771\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4]]},"references-count":53,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3528247","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2025,4]]}}}