{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:00:31Z","timestamp":1775815231303,"version":"3.50.1"},"reference-count":323,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"7","license":[{"start":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T00:00:00Z","timestamp":1719792000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2023YFC3807600"],"award-info":[{"award-number":["2023YFC3807600"]}]},{"name":"interdisciplinary doctoral","award":["iDoc 2021-360"],"award-info":[{"award-number":["iDoc 2021-360"]}]},{"name":"Personalized Health and Related Technologies"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,7]]},"DOI":"10.1109\/tpami.2024.3361862","type":"journal-article","created":{"date-parts":[[2024,2,5]],"date-time":"2024-02-05T18:37:45Z","timestamp":1707158265000},"page":"5092-5113","source":"Crossref","is-referenced-by-count":126,"title":["Towards Open Vocabulary Learning: A Survey"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-4559-7970","authenticated-orcid":false,"given":"Jianzong","family":"Wu","sequence":"first","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0550-8247","authenticated-orcid":false,"given":"Xiangtai","family":"Li","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7178-5358","authenticated-orcid":false,"given":"Shilin","family":"Xu","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9770-7720","authenticated-orcid":false,"given":"Haobo","family":"Yuan","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4868-6526","authenticated-orcid":false,"given":"Henghui","family":"Ding","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0530-7231","authenticated-orcid":false,"given":"Yibo","family":"Yang","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2284-1700","authenticated-orcid":false,"given":"Xia","family":"Li","sequence":"additional","affiliation":[{"name":"ETH Zurich, Z&#x00FC;rich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8891-6766","authenticated-orcid":false,"given":"Jiangning","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8735-2516","authenticated-orcid":false,"given":"Yunhai","family":"Tong","sequence":"additional","affiliation":[{"name":"National Key Laboratory of General Artificial Intelligence, School of Intelligence Science and Technology, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9104-2315","authenticated-orcid":false,"given":"Xudong","family":"Jiang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5534-587X","authenticated-orcid":false,"given":"Bernard","family":"Ghanem","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-5449","authenticated-orcid":false,"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[{"name":"University of Sydney, Sydney, NSW, Australia"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref6","first-page":"2152","article-title":"An embarrassingly simple approach to zero-shot learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Romera-Paredes"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.15"},{"key":"ref9","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. Annu. Meeting Assoc. Comput. Linguistics","author":"Devlin"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1310.4546"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref12","first-page":"1","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gu"},{"key":"ref13","first-page":"540","article-title":"Open-vocabulary image segmentation","volume-title":"Proc. Eur. Conf. Comput. Vis.","author":"Ghiasi"},{"key":"ref14","first-page":"11207","article-title":"Language-driven semantic segmentation","volume-title":"Proc. IEEE\/CVF Int. Conf. Learn. Representations","author":"Li"},{"key":"ref15","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref16","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3582688"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3386252"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3293318"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2023.3265051"},{"key":"ref21","first-page":"1","article-title":"Suppressing the heterogeneity: A strong feature extractor for few-shot segmentation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref24","article-title":"Transformer-based visual segmentation: A survey","author":"Li","year":"2023"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2022.01.001"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298799"},{"key":"ref27","article-title":"M2IOSR: Maximal mutual information open set recognition","author":"Sun","year":"2021"},{"key":"ref28","article-title":"Generalized out-of-distribution detection: A survey","author":"Yang","year":"2021"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2981604"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00577"},{"key":"ref32","article-title":"Microsoft COCO captions: Data collection and evaluation server","author":"Chen","year":"2015"},{"key":"ref33","article-title":"Reducing network agnostophobia","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Dhamija"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.173"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611976236.18"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1314"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298799"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00577"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00262"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.221"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref42","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref43","first-page":"1","article-title":"Learning object-language alignments for open-vocabulary object detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lin"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02005"},{"key":"ref47","first-page":"8090","article-title":"Open-vocabulary panoptic segmentation with maskclip","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ding"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01817"},{"key":"ref50","first-page":"5122","article-title":"Semantic understanding of scenes through the ADE20K dataset","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Zhou"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0733-5"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.119"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1604.01685"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00412"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00529"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00375"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01850"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_26"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2017.7965897"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.42"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_38"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461700"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/469"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2018.07.030"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.2978199"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00441"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-99-4755-3_48"},{"key":"ref68","first-page":"1","article-title":"A baseline for detecting misclassified and out-of-distribution examples in neural networks","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hendrycks"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01096"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00114"},{"key":"ref71","first-page":"1","article-title":"Training confidence-calibrated classifiers for detecting out-of-distribution samples","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lee"},{"key":"ref72","first-page":"1","article-title":"Deep autoencoding Gaussian mixture model for unsupervised anomaly detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zong"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00057"},{"key":"ref74","first-page":"6823","article-title":"Generative probabilistic novelty detection with adversarial autoencoders","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Pidhorskyi"},{"key":"ref75","article-title":"A simple fix to Mahalanobis distance for improving near-OOD detection","author":"Ren","year":"2021"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-69538-5_4"},{"key":"ref77","article-title":"Improving reconstruction autoencoder out-of-distribution detection with Mahalanobis distance","author":"Denouden","year":"2018"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00723"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_22"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00845"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00689"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01868"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3279660"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00940"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01081"},{"key":"ref86","first-page":"468","article-title":"Zero-shot semantic segmentation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Bucher"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413593"},{"key":"ref88","article-title":"Conterfactual generative zero-shot semantic segmentation","author":"Shen","year":"2021"},{"key":"ref89","article-title":"FastText.zip: Compressing text classification models","author":"Joulin","year":"2016"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1310.4546"},{"key":"ref91","first-page":"10317","article-title":"Consistent structural relation learning for zero-shot segmentation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413593"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413990"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00304"},{"key":"ref95","first-page":"4175","article-title":"Balanced meta-softmax for long-tailed visual recognition","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ren"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01100"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01406"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_43"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01514"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00117"},{"key":"ref101","first-page":"6353","article-title":"Generalized and discriminative few-shot object detection via SVD-dictionary enhancement","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wu"},{"key":"ref102","first-page":"9919","article-title":"Frustratingly simple few-shot object detection","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00856"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00867"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01384"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109018"},{"key":"ref107","first-page":"1","article-title":"Conditional networks for few-shot semantic segmentation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Rakelly"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00124"},{"key":"ref109","article-title":"One-shot instance segmentation","author":"Michaelis","year":"2018"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00919"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00967"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00111"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"ref117","article-title":"DST-DET: Simple dynamic self-training for open-vocabulary object detection","author":"Xu","year":"2023"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_16"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00689"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00267"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00677"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"ref125","first-page":"23033","article-title":"Segclip: Patch aggregation with learnable centers for open-vocabulary semantic segmentation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Luo"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02254"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00100"},{"key":"ref128","article-title":"OpenSD: Unified open-vocabulary segmentation and detection","author":"Li","year":"2023"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01451"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01863"},{"key":"ref131","article-title":"Diffusion models for zero-shot open-vocabulary segmentation","author":"Karazija","year":"2023"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref134","article-title":"Scaling open-vocabulary object detection","author":"Minderer","year":"2023"},{"key":"ref135","article-title":"DST-DET: Simple dynamic self-training for open-vocabulary object detection","author":"Xu","year":"2023"},{"key":"ref136","first-page":"1","article-title":"F-VLM: Open-vocabulary object detection upon frozen vision and language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kuo"},{"key":"ref137","article-title":"MosaicFusion: Diffusion models as data augmenters for large vocabulary instance segmentation","author":"Xie","year":"2023"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00117"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01555"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00199"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2962685"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.189"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.660"},{"key":"ref144","article-title":"Rethinking atrous convolution for semantic image segmentation","author":"Chen","year":"2017"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00254"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6805"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_45"},{"key":"ref148","article-title":"SFNet: Faster, accurate, and domain agnostic semantic segmentation via semantic flow","author":"Li","year":"2022"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00982"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00692"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_26"},{"key":"ref153","article-title":"BoundarySqueeze: Image segmentation as boundary squeezing","author":"He","year":"2021"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00326"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00378"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00972"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_17"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00904"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw.2017.66"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00511"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01024"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00925"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01008"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01249"},{"key":"ref167","first-page":"17721","article-title":"SOLOv2: Dynamic and fast instance segmentation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3223955"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3217852"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02259"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_42"},{"key":"ref172","article-title":"Panoptic-PartFormer++: A unified and decoupled view for panoptic part segmentation","author":"Li","year":"2023"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_34"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_31"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2023.3292995"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.01280"},{"key":"ref177","first-page":"1","article-title":"DAC-DETR: Divide the attention layers and conquer","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Hu"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1002\/nav.3800020109"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"ref180","first-page":"10326","article-title":"K-Net: Towards unified image segmentation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhang"},{"key":"ref181","first-page":"17864","article-title":"Per-pixel classification is not all you need for semantic segmentation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Cheng"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_7"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"ref184","first-page":"1","article-title":"Deformable {DETR}: Deformable transformers for end-to-end object detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhu"},{"key":"ref185","article-title":"Detectron2","author":"Wu","year":"2019"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"ref187","article-title":"VisualBERT: A simple and performant baseline for vision and language","author":"Li","year":"2019"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"ref189","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Lu"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref191","article-title":"ImageBERT: Cross-modal pre-training with large-scale weak-supervised image-text data","author":"Qi","year":"2020"},{"key":"ref192","article-title":"Pixel-BERT: Aligning image pixels with text by deep multi-modal transformers","author":"Huang","year":"2020"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_30"},{"key":"ref195","article-title":"EVA-CLIP: Improved training techniques for clip at scale","author":"Sun","year":"2023"},{"key":"ref196","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref197","article-title":"COCA: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022"},{"key":"ref198","article-title":"Video-text modeling with zero-shot transfer from contrastive captioners","author":"Yan","year":"2022"},{"key":"ref199","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref200","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref201","first-page":"1913","article-title":"Interpretable and globally optimal prediction for textual grounding using image concepts","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yeh"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2024.3371348"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00254"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00501"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127738"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3079993"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01525"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16188"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"ref213","article-title":"Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection","author":"Liu","year":"2023"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_10"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01629"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"ref219","article-title":"Mammut: A simple architecture for joint learning for multimodal tasks","author":"Kuo","year":"2023"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20044-1_16"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00287"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00083"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01368"},{"key":"ref224","first-page":"33781","article-title":"Bridging the gap between object and image-level representations for open-vocabulary detection","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Rasheed"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01076"},{"key":"ref226","article-title":"Open-vocabulary object detection using pseudo caption labels","author":"Cho","year":"2023"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02252"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_6"},{"key":"ref229","article-title":"Enhancing the role of context in region-word alignment for object detection","author":"Buettner","year":"2023"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref231","article-title":"GLIPv2: Unifying localization and vision-language understanding","author":"Zhang","year":"2022"},{"key":"ref232","first-page":"728","article-title":"Simple open-vocabulary object detection with vision transformers","volume-title":"Proc. Eur. Conf. Comput. Vis.","author":"Minderer"},{"key":"ref233","article-title":"Detection-oriented image-text pretraining for open-vocabulary detection","author":"Kim","year":"2023"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref235","first-page":"15946","article-title":"Multi-modal classifiers for open-vocabulary object detection","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kaul"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-16788-1_24"},{"key":"ref238","article-title":"X-paste: Revisit copy-paste at scale with clip and stablediffusion","author":"Zhao","year":"2022"},{"key":"ref239","article-title":"Guiding text-to-image diffusion model towards grounded generation","author":"Li","year":"2023"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"ref241","article-title":"Three ways to improve feature alignment for open vocabulary detection","author":"Arandjelovi\u0107","year":"2023"},{"key":"ref242","article-title":"PalI: A jointly-scaled multilingual language-image model","author":"Chen","year":"2022"},{"key":"ref243","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Brown"},{"key":"ref244","article-title":"Tip-adapter: Training-free clip-adapter for better vision-language modeling","author":"Zhang","year":"2021"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref246","article-title":"Prompt-guided transformers for end-to-end open-vocabulary object detection","author":"Song","year":"2023"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02250"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00978"},{"key":"ref249","first-page":"1","article-title":"CoDet: Co-occurrence guided region-word alignment for open-vocabulary object detection","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ma"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01072"},{"key":"ref251","first-page":"1","article-title":"Open-vocabulary semantic segmentation with frozen vision-language models","volume-title":"Proc. Brit. Mach. Vis. Conf.","author":"Ma"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"ref253","article-title":"CAT-Seg: Cost aggregation for open-vocabulary semantic segmentation","author":"Cho","year":"2023"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00080"},{"key":"ref255","article-title":"TagCLIP: Improving discrimination ability of open-vocabulary semantic segmentation","author":"Li","year":"2023"},{"key":"ref256","first-page":"1","article-title":"Convolutions die hard: Open-vocabulary segmentation with single frozen convolutional clip","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yu"},{"key":"ref257","doi-asserted-by":"crossref","DOI":"10.1109\/CVPR52733.2024.02640","article-title":"OMG-Seg: Is one model good enough for all segmentation?","author":"Li","year":"2024"},{"key":"ref258","article-title":"Clip-dinoiser: Teaching clip a few dino tricks","author":"Wysocza\u2019nska","year":"2023"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01860"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000022288.19776.77"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01074"},{"key":"ref262","article-title":"Prompt pre-training with twenty-thousand classes for open-vocabulary visual recognition","author":"Ren","year":"2023"},{"key":"ref263","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref264","article-title":"Leveraging open-vocabulary diffusion to camouflaged instance segmentation","author":"Vu","year":"2023"},{"key":"ref265","article-title":"Guiding text-to-image diffusion model towards grounded generation","volume-title":"Proc. Int. Conf. Comput. Vis.","author":"Li"},{"key":"ref266","article-title":"ActionCLIP: A new paradigm for video action recognition","author":"Wang","year":"2021"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00633"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00539"},{"key":"ref270","article-title":"OpenVIS: Open-vocabulary video instance segmentation","author":"Guo","year":"2023"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00836"},{"key":"ref272","first-page":"2639","article-title":"PointCLIP V2: Adapting clip for powerful 3D open-world learning","volume-title":"Proc. Int. Conf. Comput. Vis.","author":"Zhu"},{"key":"ref273","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01463"},{"key":"ref274","article-title":"OpenShape: Scaling up 3D shape representation towards open-world understanding","author":"Liu","year":"2023"},{"key":"ref275","article-title":"Open-vocabulary 3D detection via image-level class and debiased cross-modal contrastive learning","author":"Lu","year":"2022"},{"key":"ref276","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"ref277","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00961"},{"key":"ref278","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref279","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_23"},{"key":"ref280","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"ref281","article-title":"Multimodal open-vocabulary video classification via pre-trained vision and language models","author":"Qian","year":"2022"},{"key":"ref282","first-page":"36978","article-title":"Transforming CLIP to an open-vocabulary video model via interpolated weight optimization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Weng"},{"key":"ref283","article-title":"AIM: Adapting image models for efficient video action recognition","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yang"},{"key":"ref284","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00933"},{"key":"ref285","article-title":"VicTR: Video-conditioned text representations for activity recognition","author":"Kahatapitiya","year":"2023"},{"key":"ref286","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_29"},{"key":"ref287","article-title":"DVIS++: Improved decoupled framework for universal video segmentation","author":"Zhang","year":"2023"},{"key":"ref288","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02025"},{"key":"ref289","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00727"},{"key":"ref290","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00121"},{"key":"ref291","first-page":"1","article-title":"CoDA: Collaborative novel box discovery and cross-modal alignment for open-vocabulary 3D object detection","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Cao"},{"key":"ref292","article-title":"Object2scene: Putting objects in context for open-vocabulary 3D detection","author":"Zhu","year":"2023"},{"key":"ref293","article-title":"OpenSight: A simple open-vocabulary framework for LiDAR-based object detection","author":"Zhang","year":"2023"},{"key":"ref294","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00219"},{"key":"ref295","article-title":"RegionPLC: Regional point-language contrastive learning for open-world 3D scene understanding","author":"Yang","year":"2023"},{"key":"ref296","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02082"},{"key":"ref297","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01392"},{"key":"ref298","first-page":"1","article-title":"OpenMask3D: Open-vocabulary 3D instance segmentation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Takmaz"},{"key":"ref299","article-title":"Openins3D: Snap and lookup for 3D open-vocabulary instance segmentation","author":"Huang","year":"2023"},{"key":"ref300","article-title":"Open3DIS: Open-vocabulary 3D instance segmentation with 2D mask guidance","author":"Nguyen","year":"2023"},{"key":"ref301","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00678"},{"key":"ref302","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3146922"},{"key":"ref303","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01060"},{"key":"ref304","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00438"},{"key":"ref305","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_30"},{"key":"ref306","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3227513"},{"key":"ref307","first-page":"4047","article-title":"Fine-grained entity segmentation","volume-title":"Proc. Int. Conf. Comput. Vis.","author":"Qi"},{"key":"ref308","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00902"},{"key":"ref309","article-title":"Open world DETR: Transformer based open world object detection","author":"Dong","year":"2022"},{"key":"ref310","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01101"},{"key":"ref311","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_12"},{"key":"ref312","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00123"},{"key":"ref313","article-title":"Dual decision improves open-set panoptic segmentation","volume-title":"Proc. Brit. Mach. Vis. Conf.","author":"Xu"},{"key":"ref314","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01417"},{"key":"ref315","first-page":"1","article-title":"AIMS: All-inclusive multi-level segmentation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Qi"},{"key":"ref316","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01477"},{"key":"ref317","article-title":"An open and comprehensive pipeline for unified object grounding and detection","author":"Zhao","year":"2024"},{"key":"ref318","article-title":"Semantic-SAM: Segment and recognize anything at any granularity","author":"Li","year":"2023"},{"key":"ref319","article-title":"Open-vocabulary SAM: Segment and recognize twenty-thousand classes interactively","author":"Yuan","year":"2024"},{"key":"ref320","article-title":"Rethinking evaluation metrics of open-vocabulary segmentaion","author":"Zhou","year":"2023"},{"key":"ref321","first-page":"1","article-title":"Explore in-context learning for 3D point cloud understanding","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Fang"},{"key":"ref322","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00660"},{"key":"ref323","first-page":"1","article-title":"Visual instruction tuning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Liu"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10550108\/10420487.pdf?arnumber=10420487","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T13:40:36Z","timestamp":1731246036000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10420487\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7]]},"references-count":323,"journal-issue":{"issue":"7"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3361862","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,7]]}}}