{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T01:58:12Z","timestamp":1778551092021,"version":"3.51.4"},"reference-count":62,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.02558","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"27081-27091","source":"Crossref","is-referenced-by-count":93,"title":["ULIP-2: Towards Scalable Multimodal Pre-Training for 3D Understanding"],"prefix":"10.1109","author":[{"given":"Le","family":"Xue","sequence":"first","affiliation":[{"name":"Salesforce AI Research"}]},{"given":"Ning","family":"Yu","sequence":"additional","affiliation":[{"name":"Salesforce AI Research"}]},{"given":"Shu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Salesforce AI Research"}]},{"given":"Artemis","family":"Panagopoulou","sequence":"additional","affiliation":[{"name":"Salesforce AI Research"}]},{"given":"Junnan","family":"Li","sequence":"additional","affiliation":[{"name":"Salesforce AI Research"}]},{"given":"Roberto","family":"Mart\u00edn-Mart\u00edn","sequence":"additional","affiliation":[{"name":"University of Texas at Austin"}]},{"given":"Jiajun","family":"Wu","sequence":"additional","affiliation":[{"name":"Stanford University"}]},{"given":"Caiming","family":"Xiong","sequence":"additional","affiliation":[{"name":"Salesforce AI 
Research"}]},{"given":"Ran","family":"Xu","sequence":"additional","affiliation":[{"name":"Salesforce AI Research"}]},{"given":"Juan Carlos","family":"Niebles","sequence":"additional","affiliation":[{"name":"Salesforce AI Research"}]},{"given":"Silvio","family":"Savarese","sequence":"additional","affiliation":[{"name":"Salesforce AI Research"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.170"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2016.xii.041"},{"key":"ref4","article-title":"Shapenet: An information-rich 3d model repository","author":"Angel","year":"2015","journal-title":"arXiv preprint"},{"key":"ref5","first-page":"1931","article-title":"Unifying vision-and-language tasks via text generation","volume-title":"International Conference on Machine Learning","author":"Cho"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01263"},{"key":"ref7","article-title":"Autoencoders as cross-modal teachers: Can pretrained 2d image transformers help 3d representation learning?","author":"Dong","year":"2022","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00961"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00007"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01112"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"ref13","article-title":"Openclip","author":"Ilharco","year":"2021","journal-title":"If you use this software, please cite it as below"},{"key":"ref14","doi-asserted-by":"crossref","DOI":"10.1088\/978-1-6270-5612-0","article-title":"Morgan & Claypool Publishers","volume-title":"3D scientific visualization with 
Blender\u00ae","author":"Kent","year":"2015"},{"key":"ref15","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","volume-title":"International Conference on Machine Learning","author":"Kim"},{"key":"ref16","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref17","first-page":"12888","article-title":"Blip: Bootstrapping language-image pretraining for unified vision-language understanding and generation","author":"Li","year":"2022","journal-title":"International Conference on Machine Learning"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3653724"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01667"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00746"},{"key":"ref22","article-title":"Openshape: Scaling up 3d shape representation towards open-world understanding","author":"Liu","year":"2023","journal-title":"Advances in neural information processing systems"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00534"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00294"},{"key":"ref25","first-page":"7219","article-title":"Neural baby talk","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Lu"},{"key":"ref26","article-title":"Rethinking network design and local geometry in point cloud: A simple residual mlp framework","author":"Ma","year":"2022","journal-title":"arXiv preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"ref28","first-page":"529","article-title":"Slip: Self-supervision meets 
language-image pretraining","volume-title":"European Conference on Computer Vision","author":"Mu","year":"2022"},{"key":"ref29","article-title":"Gpt-4 technical report","year":"2023","journal-title":"OpenAI blog"},{"key":"ref30","article-title":"X-instructblip: A framework for aligning x-modal instruction-aware representations to llms and emergent cross-modal reasoning","author":"Panagopoulou","year":"2023","journal-title":"arXiv preprint"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20086-1_35"},{"key":"ref32","first-page":"652","article-title":"Pointnet: Deep learning on point sets for 3d classification and segmentation","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Qi"},{"key":"ref33","article-title":"Pointnet++: Deep hierarchical feature learning on point sets in a metric space","volume":"30","author":"Ruizhongtai Qi","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref34","article-title":"Contrast with reconstruct: Contrastive 3d representation learning guided by generative pretraining","volume-title":"International Conference on Machine Learning (ICML)","author":"Qi"},{"key":"ref35","article-title":"Gpt4point: A unified framework for point-language understanding and generation","author":"Qi","year":"2023","journal-title":"arXiv preprint"},{"key":"ref36","article-title":"Pointnext: Revisiting pointnet++ with improved training and scaling strategies","author":"Qian","year":"2022","journal-title":"arXiv preprint"},{"key":"ref37","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01837"},{"key":"ref39","article-title":"Model-agnostic hierarchical attention for 3d object 
detection","author":"Shu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref40","article-title":"X-InstructBLIP: A framework for aligning x-modal instruction-aware representations to LLMs and emergent cross-modal reasoning","year":"2023","journal-title":"Anonymous Submission"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00167"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00167"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"ref46","article-title":"Simvlm: Simple visual language model pretraining with weak supervision","author":"Wang","year":"2021","journal-title":"arXiv preprint"},{"key":"ref47","article-title":"P2p: Tuning pretrained image models for point cloud analysis with point-to-pixel prompting","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995547"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.09.008"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298801"},{"key":"ref51","article-title":"Pointllm: Empowering large language models to understand point clouds","author":"Xu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref52","article-title":"Ulip: Learning unified representation of language, image and point cloud for 3d understanding","author":"Xue","year":"2022","journal-title":"arXiv preprint"},{"key":"ref53","article-title":"Let images give you more: Point cloud cross-modal training for shape analysis","author":"Yan","year":"2022","journal-title":"arXiv 
preprint"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01161"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01871"},{"key":"ref56","article-title":"Multi-grained vision language pretraining: Aligning texts with visual concepts","author":"Zeng","year":"2021","journal-title":"arXiv preprint"},{"key":"ref57","article-title":"Point-m2ae: multi-scale masked autoencoders for hierarchical point cloud pretraining","author":"Zhang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref58","first-page":"8552","article-title":"Point-clip: Point cloud understanding by clip","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Zhang"},{"key":"ref59","article-title":"Learning 3d representations from 2d pretrained models via image-to-point masked autoencoders","author":"Zhang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00674"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00249"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Seattle, WA, USA","start":{"date-parts":[[2024,6,16]]},"end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition 
(CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10657055.pdf?arnumber=10657055","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T05:36:43Z","timestamp":1726810603000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10657055\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":62,"URL":"https:\/\/doi.org\/10.1109\/cvpr52733.2024.02558","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}