{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:46:26Z","timestamp":1772905586943,"version":"3.50.1"},"reference-count":69,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Hong Kong Research Grant Council","award":["27209621"],"award-info":[{"award-number":["27209621"]}]},{"name":"General Research Fund Scheme","award":["17202422"],"award-info":[{"award-number":["17202422"]}]},{"name":"RGC Theme-based research","award":["T45-701\/22-R"],"award-info":[{"award-number":["T45-701\/22-R"]}]},{"name":"RGC Matching Fund Scheme"},{"name":"JC STEM Lab of Robotics for Soft Materials"},{"name":"The Hong Kong Jockey Club Charities Trust"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1109\/tpami.2024.3410324","type":"journal-article","created":{"date-parts":[[2024,6,6]],"date-time":"2024-06-06T17:26:09Z","timestamp":1717694769000},"page":"8517-8533","source":"Crossref","is-referenced-by-count":24,"title":["Lowis3D: Language-Driven Open-World Instance-Level 3D Scene Understanding"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1582-5092","authenticated-orcid":false,"given":"Runyu","family":"Ding","sequence":"first","affiliation":[{"name":"Department of Electrical and Electronic Engineering, University of Hong Kong, Hong Kong"}]},{"given":"Jihan","family":"Yang","sequence":"additional","affiliation":[{"name":"Department of Electrical and Electronic Engineering, University of Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3562-3094","authenticated-orcid":false,"given":"Chuhui","family":"Xue","sequence":"additional","affiliation":[{"name":"ByteDance Inc, Singapore"}]},{"given":"Wenqing","family":"Zhang","sequence":"additional","affiliation":[{"name":"ByteDance Inc, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2570-9118","authenticated-orcid":false,"given":"Song","family":"Bai","sequence":"additional","affiliation":[{"name":"ByteDance Inc, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4285-1626","authenticated-orcid":false,"given":"Xiaojuan","family":"Qi","sequence":"additional","affiliation":[{"name":"Department of Electrical and Electronic Engineering, University of Hong Kong, Hong Kong"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00961"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00273"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"ref5","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref6","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia"},{"key":"ref7","article-title":"Florence: A new foundation model for computer vision","author":"Yuan","year":"2021"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref9","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","author":"Gu","year":"2021"},{"key":"ref10","first-page":"33781","article-title":"Bridging the gap between object and image-level representations for open-vocabulary detection","volume-title":"Proc. 36th Conf. Neural Inf. Process. Syst.","author":"Rasheed"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"ref12","article-title":"Language-driven semantic segmentation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Li"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.02025"},{"key":"ref17","article-title":"ViT-GPT2 image captioning","year":"2022"},{"key":"ref18","article-title":"OFA: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","author":"Wang","year":"2022"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00785"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00455"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00678"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.170"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3148457"},{"key":"ref26","article-title":"PointNet: Deep hierarchical feature learning on point sets in a metric space","author":"Qi","year":"2017"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00278"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00651"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00319"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00831"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00319"},{"key":"ref32","article-title":"Submanifold sparse convolutional networks","author":"Graham","year":"2017"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00407"},{"key":"ref34","first-page":"6737","article-title":"Learning object bounding boxes for 3D instance segmentation on point clouds","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"ref36","article-title":"MOPT: Multi-object panoptic tracking","author":"Hurtado","year":"2020"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9340837"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01289"},{"key":"ref39","first-page":"466","article-title":"Zero-shot semantic segmentation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Bucher"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413593"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00845"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00940"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/3DV53792.2021.00107"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.066"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01874"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01886"},{"key":"ref49","article-title":"OpenMask3D: Open-vocabulary 3D instance segmentation","author":"Takmaz","year":"2023"},{"key":"ref50","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref51","article-title":"ClipCap: Clip prefix for image captioning","author":"Mokady","year":"2021"},{"issue":"6","key":"ref52","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"Hossain","year":"2019","journal-title":"ACM Comput. Surv."},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3054739"},{"key":"ref55","article-title":"Open3D: A modern library for 3D data processing","author":"Zhou","year":"2018"},{"key":"ref56","first-page":"1321","article-title":"On calibration of modern neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Guo"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00219"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01463"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093545"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045167"},{"key":"ref62","article-title":"Rectified linear units improve restricted Boltzmann machines","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Nair"},{"key":"ref63","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017"},{"issue":"8","key":"ref64","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref65","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"ref66","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00873"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10746266\/10551628.pdf?arnumber=10551628","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:25:27Z","timestamp":1732667127000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10551628\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":69,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3410324","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]}}}