{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:18:04Z","timestamp":1775837884132,"version":"3.50.1"},"reference-count":112,"publisher":"Tsinghua University Press","issue":"3","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62106150,62272315"],"award-info":[{"award-number":["62106150,62272315"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Big Data Min. Anal."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.26599\/bdma.2024.9020098","type":"journal-article","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T21:07:31Z","timestamp":1743800851000},"page":"726-750","source":"Crossref","is-referenced-by-count":11,"title":["A Survey of Zero-Shot Object Detection"],"prefix":"10.26599","volume":"8","author":[{"given":"Weipeng","family":"Cao","sequence":"first","affiliation":[{"name":"Guangdong Laboratory of Artificial Intelligence and Digital Economy (Shenzhen),Shenzhen,China,518107"}]},{"given":"Xuyang","family":"Yao","sequence":"additional","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University,Shenzhen,China,518060"}]},{"given":"Zhiwu","family":"Xu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University,Shenzhen,China,518060"}]},{"given":"Ye","family":"Liu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University,Shenzhen,China,518060"}]},{"given":"Yinghui","family":"Pan","sequence":"additional","affiliation":[{"name":"Shenzhen University,National Engineering Laboratory for Big Data System Computing Technology,Shenzhen,China,518060"}]},{"given":"Zhong","family":"Ming","sequence":"additional","affiliation":[{"name":"Guangdong Laboratory of Artificial Intelligence and Digital Economy (Shenzhen),Shenzhen,China,518107"}]}],"member":"11138","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3215572"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2023.01.106"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2023.3238524"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460700"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093355"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00577"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02250"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01072"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"ref12","first-page":"42098","article-title":"X-Paste: Revisiting scalable Copy-Paste for instance segmentation using clip and StableDiffusion","volume-title":"Proc. 40th Int. Conf. Machine Learning","author":"Zhao"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_16"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_10"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3293484"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00681"},{"key":"ref20","first-page":"15946","article-title":"Multi-modal classifiers for open-vocabulary object detection","volume-title":"Proc. 40th Int. Conf. Machine Learning","author":"Kaul"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25159"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.15"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3184821"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2986892"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109869"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-022-02604-0"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.02.056"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2023.104758"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3151073"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.106773"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3140070"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00441"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01101"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01885"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3361862"},{"key":"ref38","volume-title":"Pixel-BERT: Aligning image pixels with text by deep multi-modal transformers","author":"Huang","year":"2020"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref42","first-page":"1","article-title":"Learning object-language alignments for open-vocabulary object detection","volume-title":"Proc. 11th Int. Conf. Learning Representations","author":"Lin"},{"key":"ref43","first-page":"393","article-title":"Localized vision-language matching for open-vocabulary object detection","volume-title":"Proc. 44th DAGM German Conf. Pattern Recognition","author":"Bravo"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28278"},{"key":"ref45","first-page":"663","article-title":"DetCLIP: Dictionary-enriched visual-concept paralleled pre-training for open-world detection","volume-title":"Proc. 36th Neural Information Processing Systems","author":"Yao"},{"key":"ref46","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Machine Learning, Virtual Event","author":"Radford","year":"2021"},{"key":"ref47","first-page":"1","article-title":"Open-vocabulary object detection upon frozen vision and language models","volume-title":"Proc. 11th Int. Conf. Learning Representations","author":"Kuo"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01368"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00978"},{"key":"ref51","first-page":"1","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","volume-title":"Proc. 10th Int. Conf. Learning Representations, Virtual Event","author":"Gu","year":"2021"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01076"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_44"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.473"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108237"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01288"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1002\/widm.1488"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2013.09.056"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-50077-5_2"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.453"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_29"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00635"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01268"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3011807"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00171"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2019.8803655"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00267"},{"key":"ref69","first-page":"8805","article-title":"Chain of visual perception: Harnessing multimodal large language models for zero-shot camouflaged object detection","volume-title":"Proc. 32nd ACM Int. Conf. Multimedia","author":"Tang"},{"key":"ref70","first-page":"2179","article-title":"Zero-shot visual relation detection via composite visual cues from large language models","volume-title":"Proc. 37th Int. Conf. Neural Information Processing Systems","author":"Li"},{"key":"ref71","first-page":"107","article-title":"Background learnable cascade for zero-shot object detection","volume-title":"Proc. 15th Asian Conf. Computer Vision","author":"Zheng"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00480"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00240"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00618"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093384"},{"key":"ref77","doi-asserted-by":"crossref","first-page":"117","DOI":"10.1016\/j.neucom.2021.03.073","article-title":"Rethinking semantic-visual alignment in zero-shot object detection via a softplus margin focal loss","volume":"449","author":"Li","year":"2021","journal-title":"Neurocomputing"},{"issue":"4","key":"ref78","doi-asserted-by":"crossref","first-page":"998","DOI":"10.1109\/TCSVT.2019.2899569","article-title":"Zero shot detection","volume":"30","author":"Zhu","year":"2019","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.690"},{"key":"ref80","first-page":"8690","article-title":"Zero-shot object detection with textual descriptions","volume-title":"Proc. 33rd AAAI Conf. Artificial Intelligence","author":"Li"},{"key":"ref81","first-page":"1","article-title":"Visual classification via description from large language models","volume-title":"Proc. 11th Int. Conf. Learning Representations","author":"Menon"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102537"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447560"},{"key":"ref84","first-page":"6926","article-title":"Zero-shot aerial object detection with visual description regularization","volume-title":"Proc. 38th AAAI Conf. Artificial Intelligence","author":"Zang"},{"issue":"11","key":"ref85","doi-asserted-by":"crossref","first-page":"139","DOI":"10.1145\/3422622","article-title":"Generative adversarial networks","volume":"63","author":"Goodfellow","year":"2020","journal-title":"Commun. ACM"},{"key":"ref86","first-page":"14","article-title":"Auto-encoding variational bayes","volume-title":"Proc. 2nd Int. Conf. Learning Representations","author":"Kingma"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.308"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00227"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00758"},{"key":"ref90","first-page":"479","article-title":"Latent embedding feedback and discriminative features for zero-shot classification","volume-title":"Proc. 16th European Conf. Computer Vision-ECCV 2020","author":"Narayan"},{"key":"ref91","first-page":"155","article-title":"Synthesizing the unseen for zero-shot object detection","volume-title":"Proc. 15th Asian Conf. Computer Vision","author":"Hayat"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6996"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00747"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3135480"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28353"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448084"},{"key":"ref97","first-page":"538","article-title":"Research progress of zero-shot learning beyond computer vision","volume-title":"Proc. 20th Int. Conf. Algorithms and Architectures for Parallel Processing","author":"Cao"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.256"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00937"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2021.3123374"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3326279"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3146922"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00902"},{"key":"ref104","first-page":"1","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","volume-title":"Proc. 9th Int. Conf. Learning Representations","author":"Zhu"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_12"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.37256\/aie.4220233058"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610712"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0733-5"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"}],"container-title":["Big Data Mining and Analytics"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/8254253\/10949806\/10949839.pdf?arnumber=10949839","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,7]],"date-time":"2025-04-07T22:20:11Z","timestamp":1744064411000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10949839\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6]]},"references-count":112,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.26599\/bdma.2024.9020098","relation":{},"ISSN":["2096-0654","2097-406X"],"issn-type":[{"value":"2096-0654","type":"print"},{"value":"2097-406X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6]]}}}