{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T14:56:21Z","timestamp":1776351381252,"version":"3.51.2"},"reference-count":87,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s11263-024-02214-4","type":"journal-article","created":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T06:21:11Z","timestamp":1724307671000},"page":"825-843","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":68,"title":["Contextual Object Detection with Multimodal Large Language Models"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1110-5062","authenticated-orcid":false,"given":"Yuhang","family":"Zang","sequence":"first","affiliation":[]},{"given":"Wei","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jun","family":"Han","sequence":"additional","affiliation":[]},{"given":"Kaiyang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Chen Change","family":"Loy","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,20]]},"reference":[{"key":"2214_CR1","unstructured":"Alayrac, J. 
B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., Ring, R., Rutherford, E., Cabi, S., Han, T., Gong, Z., Samangooei, S., Monteiro, M., Menick, J., Borgeaud, S., & Simonyan, K. (2022). Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems, 35, 23716\u201323736."},{"key":"2214_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., & Zhang, L. (2018). Bottom-up and top-down attention for image captioning and visual question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"2214_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, CL., & Parikh, D. (2015). VQA: Visual question answering. In Proceedings of the IEEE international conference on computer vision.","DOI":"10.1109\/ICCV.2015.279"},{"key":"2214_CR4","doi-asserted-by":"crossref","unstructured":"Bansal, A., Sikka, K., Sharma, G., Chellappa, R., & Divakaran, A. (2018). Zero-shot object detection. In Proceedings of the European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"2214_CR5","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., & Agarwal, S. (2020). Language models are few-shot learners. In Advances in neural information processing systems."},{"key":"2214_CR6","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In European conference on computer vision.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2214_CR7","doi-asserted-by":"crossref","unstructured":"Chen, J., Guo, H., Yi, K., Li, B., & Elhoseiny, M. (2022a). 
VisualGPT: Data-efficient adaptation of pretrained language models for image captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"2214_CR8","doi-asserted-by":"crossref","unstructured":"Chen, L., Zhang, H., Xiao, J., Nie, L., Shao, J., Liu, W., & Chua, T. S. (2017). SCA-CNN: Spatial and channel-wise attention in convolutional networks for image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.667"},{"key":"2214_CR9","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D. J., & Hinton, G. (2022b). Pix2Seq: A language modeling framework for object detection. In ICLR."},{"key":"2214_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Z., Huang, S., & Tao, D. (2018). Context refinement for object detection. In Proceedings of the European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-030-01237-3_5"},{"key":"2214_CR11","unstructured":"Chowdhery, A., Narang, S., Devlin, J., Bosma, M., Mishra, G., Roberts, A., Barham, P., Chung, H. W., Sutton, C., Gehrmann, S. & Schuh, P. (2022). PaLM: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311"},{"key":"2214_CR12","doi-asserted-by":"crossref","unstructured":"Dai, Y., Lang, H., Zeng, K., Huang, F., & Li, Y. (2023). Exploring large language models for multi-modal out-of-distribution detection. arXiv preprint arXiv:2310.08027","DOI":"10.18653\/v1\/2023.findings-emnlp.351"},{"key":"2214_CR13","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L. J., Li, K., & Fei-Fei, L. (2009). ImageNet: A large-scale hierarchical image database. In 2009 IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2214_CR14","doi-asserted-by":"crossref","unstructured":"Divvala, S. K., Hoiem, D., Hays, J. H., Efros, A. A., & Hebert, M. (2009). 
An empirical study of context in object detection. In 2009 IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPRW.2009.5206532"},{"key":"2214_CR15","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., & Gelly, S. (2021) An image is worth 16x16 words: Transformers for image recognition at scale. In ICLR."},{"key":"2214_CR16","unstructured":"Driess, D., Xia, F., Sajjadi, MSM., Lynch, C., Chowdhery, A., Ichter, B., Wahid, A., Tompson, J., Vuong, Q., Yu, T., Huang, W., Chebotar, Y., Sermanet, P., Duckworth, D., Levine, S., Vanhoucke, V., Hausman, K., Toussaint, M., Greff, K., Zeng, A., Mordatch, I., & Florence, P. (2023). PaLM-E: An embodied multimodal language model. arXiv preprint arXiv:2303.03378"},{"key":"2214_CR17","doi-asserted-by":"crossref","unstructured":"Du, Y., Wei, F., Zhang, Z., Shi, M., Gao, Y., & Li, G. (2022). Learning to prompt for open-vocabulary object detection with vision-language model. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"2214_CR18","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C. K., Winn, J., & Zisserman, A. (2010). The PASCAL visual object classes (VOC) challenge. International Journal of Computer Vision, 88, 303\u2013338.","journal-title":"International Journal of Computer Vision"},{"key":"2214_CR19","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., Cui, Y., Srinivas, A., Qian, R., Lin, TY., Cubuk, ED., Le, QV., & Zoph, B. (2021). Simple copy-paste is a strong data augmentation method for instance segmentation. 
In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"2214_CR20","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., & Parikh, D. (2017). Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2017.670"},{"key":"2214_CR21","unstructured":"Gu, X., Lin, T. Y., Kuo, W., & Cui, Y. (2022). Open-vocabulary object detection via vision and language knowledge distillation. In ICLR."},{"key":"2214_CR22","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., & Girshick, R. (2019). LVIS: A dataset for large vocabulary instance segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2019.00550"},{"key":"2214_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2214_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask R-CNN. In Proceedings of the IEEE international conference on computer vision.","DOI":"10.1109\/ICCV.2017.322"},{"issue":"8","key":"2214_CR25","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"2214_CR26","unstructured":"Huang, S., Dong, L., Wang, W., Hao, Y., Singhal, S., Ma, S., Lv, T., Cui, L., Mohammed, O. K., & Liu, Q. (2023). 
Language is not all you need: Aligning perception with language models. arXiv preprint arXiv:2302.14045"},{"key":"2214_CR27","unstructured":"HuggingFace. (2024). Huggingface. https:\/\/huggingface.co\/"},{"key":"2214_CR28","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y. T., Parekh, Z., Pham, H., Le, Q., Sung, Y. H., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning."},{"key":"2214_CR29","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., & Carion, N. (2021). Mdetr-modulated detection for end-to-end multi-modal understanding. In Proceedings of the IEEE\/CVF international conference on computer vision.","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"2214_CR30","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"2214_CR31","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A. C., Lo, W. Y., & Doll\u00e1r, P. (2023). Segment anything. arXiv preprint arXiv:2304.02643","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2214_CR32","unstructured":"Koh, J. Y., Salakhutdinov, R., & Fried, D. (2023). Grounding language models to images for multimodal generation. arXiv preprint arXiv:2301.13823"},{"key":"2214_CR33","unstructured":"Kuo, W., Cui, Y., Gu, X., Piergiovanni, A., & Angelova, A. (2022). F-VLM: Open-vocabulary object detection upon frozen vision and language models. 
arXiv preprint arXiv:2209.15639"},{"issue":"7","key":"2214_CR34","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., Rom, H., Alldrin, N., Uijlings, J., Krasin, I., Pont-Tuset, J., Kamali, S., Popov, S., Malloci, M., Kolesnikov, A., & Duerig, T. (2020). The open images dataset v4. International Journal of Computer Vision, 128(7), 1956\u20131981.","journal-title":"International Journal of Computer Vision"},{"key":"2214_CR35","doi-asserted-by":"crossref","unstructured":"Law, H., & Deng, J. (2018). CornerNet: Detecting objects as paired keypoints. In Proceedings of the European conference on computer vision ECCV.","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"2214_CR36","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023) BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597"},{"key":"2214_CR37","doi-asserted-by":"crossref","unstructured":"Li, L. H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., Wang, L., Yuan, L., Zhang, L., & Hwang, J. N., & Chang, K. W. (2022) Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"2214_CR38","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft COCO: Common objects in context. In Computer vision\u2013ECCV 2014: 13th European conference, Zurich, Switzerland, September 6\u201312, 2014, proceedings, Part V 13."},{"key":"2214_CR39","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., & Lee, Y. J. (2023a). Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2214_CR40","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y. J. 
(2023b). Visual instruction tuning. arXiv preprint arXiv:2304.08485"},{"key":"2214_CR41","doi-asserted-by":"crossref","unstructured":"Liu, J., Ding, H., Cai, Z., Zhang, Y., Satzoda, R. K., Mahadevan, V., & Manmatha, R. (2023c). PolyFormer: Referring image segmentation as sequential polygon generation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52729.2023.01789"},{"key":"2214_CR42","doi-asserted-by":"crossref","unstructured":"Liu, S., Zeng, Z., Ren, T., Li, F., Zhang, H., Yang, J., Li, C., Yang, J., Su, H., Zhu, J., & Zhang, L. (2023d). Grounding DINO: Marrying dino with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"2214_CR43","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016). SSD: Single shot multibox detector. In Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14."},{"key":"2214_CR44","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE\/CVF international conference on computer vision.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2214_CR45","unstructured":"Loshchilov, I., & Hutter, F. (2019). Decoupled weight decay regularization. In ICLR."},{"key":"2214_CR46","unstructured":"Mokady, R., Hertz, A., & Bermano, A. H. (2021) ClipCap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734"},{"key":"2214_CR47","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., Chen, X., Liu, X., Cho, N. G., Lee, S. W., Fidler, S., Urtasun, R., & Yuille, A. (2014). The role of context for object detection and semantic segmentation in the wild. 
In Proceedings of the IEEE conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2014.119"},{"key":"2214_CR48","unstructured":"Muchen, L., & Leonid, S. (2021). Referring transformer: A one-step approach to multi-task visual grounding. In NeurIPS."},{"key":"2214_CR49","doi-asserted-by":"crossref","unstructured":"Nagaraja, V. K., Morariu, V. I., & Davis, L. S. (2016). Modeling context between objects for referring expression understanding. In ECCV.","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"2214_CR50","unstructured":"OpenAI. (2022). Chatgpt: Optimizing language models for dialogue. https:\/\/openai.com\/blog\/chatgpt"},{"key":"2214_CR51","unstructured":"OpenAI. (2023). GPT-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2214_CR52","unstructured":"Ouyang-Zhang, J., Cho, J. H., Zhou, X., & Kr\u00e4henb\u00fchl, P. (2022). NMS strikes back. arXiv preprint arXiv:2212.06137"},{"key":"2214_CR53","doi-asserted-by":"crossref","unstructured":"Plummer, B. A., Wang, L., Cervantes, C. M., Caicedo, J. C., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30K entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In Proceedings of the IEEE international conference on computer vision.","DOI":"10.1109\/ICCV.2015.303"},{"key":"2214_CR54","unstructured":"Radford, A., Narasimhan, K., Salimans, T., & Sutskever, I. (2018). Improving language understanding by generative pre-training. OpenAI Blog."},{"issue":"8","key":"2214_CR55","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., & Sutskever, I. (2019). Language models are unsupervised multitask learners. OpenAI Blog, 1(8), 9.","journal-title":"OpenAI Blog"},{"key":"2214_CR56","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., & Clark, J., Krueger, G. (2021). 
Learning transferable visual models from natural language supervision. In International conference on machine learning."},{"issue":"140","key":"2214_CR57","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., & Liu, P. J. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, 21(140), 1\u201367.","journal-title":"Journal of Machine Learning Research"},{"key":"2214_CR58","first-page":"33781","volume":"35","author":"H Rasheed","year":"2022","unstructured":"Rasheed, H., Maaz, M., Khattak, M. U., Khan, S., & Khan, F. S. (2022). Bridging the gap between object and image-level representations for open-vocabulary detection. Advances in Neural Information Processing Systems, 35, 33781\u201333794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2214_CR59","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster R-CNN: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, 28."},{"key":"2214_CR60","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., & Savarese, S. (2019). Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"2214_CR61","doi-asserted-by":"crossref","unstructured":"Shao, S., Li, Z., Zhang, T., Peng, C., Yu, G., Zhang, X., Li, J., & Sun, J. (2019). Objects365: A large-scale, high-quality dataset for object detection. In Proceedings of the IEEE\/CVF international conference on computer vision.","DOI":"10.1109\/ICCV.2019.00852"},{"key":"2214_CR62","unstructured":"Shen, Y., Song, K., Tan, X., Li, D., Lu, W., & Zhuang, Y. (2023). 
HuggingGPT: Solving ai tasks with chatgpt and its friends in huggingface. arXiv preprint arXiv:2303.17580"},{"key":"2214_CR63","doi-asserted-by":"crossref","unstructured":"Shrivastava, A. & Gupta, A. (2016). Contextual priming and feedback for faster r-cnn. In ECCV.","DOI":"10.1007\/978-3-319-46448-0_20"},{"key":"2214_CR64","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Chen, H., & He, T. (2019). FCOS: Fully convolutional one-stage object detection. In ICCV.","DOI":"10.1109\/ICCV.2019.00972"},{"key":"2214_CR65","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, MA., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., & Azhar, F. (2023). LLaMA: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971"},{"key":"2214_CR66","first-page":"200","volume":"34","author":"M Tsimpoukelli","year":"2021","unstructured":"Tsimpoukelli, M., Menick, J. L., Cabi, S., Eslami, S., Vinyals, O., & Hill, F. (2021). Multimodal few-shot learning with frozen language models. Advances in Neural Information Processing Systems, 34, 200\u2013212.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2214_CR67","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, AN., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems, 30."},{"key":"2214_CR68","doi-asserted-by":"crossref","unstructured":"Wang, J., Zhang, P., Chu, T., Cao, Y., Zhou, Y., Wu, T., Wang, B., He, C., & Lin, D. (2023a). V3Det: Vast vocabulary visual detection dataset. arXiv preprint arXiv:2304.03752","DOI":"10.1109\/ICCV51070.2023.01817"},{"key":"2214_CR69","doi-asserted-by":"crossref","unstructured":"Wang, W., Dai, J., Chen, Z., Huang, Z., Li, Z., Zhu, X., Hu, X., Lu, T., Lu, L., & Li, H. (2023b). InternImage: Exploring large-scale vision foundation models with deformable convolutions. 
In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"2214_CR70","doi-asserted-by":"crossref","unstructured":"Wang, Z., Lu, Y., Li, Q., Tao, X., Guo, Y., Gong, M., & Liu, T. (2022) CRIS: Clip-driven referring image segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"2214_CR71","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., & Duan, N. (2023a). Visual ChatGPT: Talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671"},{"key":"2214_CR72","doi-asserted-by":"crossref","unstructured":"Wu, J., Li, X., Ding, H., Li, X., Cheng, G., Tong, Y., & Loy, C. C. (2023b). Betrayed by captions: Joint caption grounding and generation for open vocabulary instance segmentation. arXiv preprint arXiv:2301.00805","DOI":"10.36227\/techrxiv.22082723.v1"},{"key":"2214_CR73","doi-asserted-by":"crossref","unstructured":"Wu, S., Zhang, W., Jin, S., Liu, W., & Loy, C. C. (2023c). Aligning bag of regions for open-vocabulary object detection. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"2214_CR74","doi-asserted-by":"crossref","unstructured":"Wu, X., Zhu, F., Zhao, R., & Li, H. (2023d). CORA: Adapting clip for open-vocabulary detection with region prompting and anchor pre-matching. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"2214_CR75","doi-asserted-by":"crossref","unstructured":"Yang, Z., Wang, J., Tang, Y., Chen, K., Zhao, H., & Torr, PH. (2022). LAVT: Language-aware vision transformer for referring image segmentation. 
In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"2214_CR76","unstructured":"Yang, Z., Li, L., Wang, J., Lin, K., Azarnasab, E., Ahmed, F., Liu, Z., Liu, C., Zeng, M., & Wang, L. (2023). MM-REACT: Prompting chatgpt for multimodal reasoning and action. arXiv preprint arXiv:2303.11381"},{"key":"2214_CR77","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., & Hockenmaier, J. (2014). From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics, 2, 67\u201378.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2214_CR78","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A. C., & Berg, T. L. (2016). Modeling context in referring expressions. In Computer vision\u2013ECCV 2016: 14th European conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part II 14."},{"key":"2214_CR79","unstructured":"Yu, W., Iter, D., Wang, S., Xu, Y., Ju, M., Sanyal, S., Zhu, C., Zeng, M., & Jiang, M. (2022). Generate rather than retrieve: Large language models are strong context generators. In ICLR."},{"key":"2214_CR80","doi-asserted-by":"crossref","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., & Loy, C. C. (2022). Open-vocabulary detr with conditional matching. In ECCV.","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"2214_CR81","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K. D., Hu, D. H., & Chang, S. F. (2021). Open-vocabulary object detection using captions. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"2214_CR82","first-page":"36067","volume":"35","author":"H Zhang","year":"2022","unstructured":"Zhang, H., Zhang, P., Hu, X., Chen, Y. C., Li, L. 
H., Dai, X., Wang, L., Yuan, L., Hwang, J. N., & Gao, J. (2022). GLIPv2: Unifying localization and vision-language understanding. Advances in Neural Information Processing Systems, 35, 36067\u201336080.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2214_CR83","unstructured":"Zhang, H., Li, F., Liu, S., Zhang, L., Su, H., Zhu, J., Ni, L. M., & Shum, H. Y. (2023). DINO: Detr with improved denoising anchor boxes for end-to-end object detection. In ICLR."},{"key":"2214_CR84","unstructured":"Zhang, S., Roller, S., Goyal, N., Artetxe, M., Chen, M., Chen, S., Dewan, C., Diab, M., Li, X., & Lin, X. V. (2022b). OPT: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068"},{"key":"2214_CR85","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L. H., Zhou, L., Dai, X., Yuan, L., & Li, Y. (2022). RegionCLIP: Region-based language-image pretraining. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"2214_CR86","doi-asserted-by":"crossref","unstructured":"Zhou, X., Girdhar, R., Joulin, A., Kr\u00e4henb\u00fchl, P., & Misra, I. (2022). Detecting twenty-thousand classes using image-level supervision. In: ECCV.","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"2214_CR87","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., & Dai, J. (2021). Deformable DETR: Deformable transformers for end-to-end object detection. 
In ICLR."}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02214-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02214-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02214-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T06:43:23Z","timestamp":1737528203000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02214-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,20]]},"references-count":87,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["2214"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02214-4","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,20]]},"assertion":[{"value":"22 February 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 July 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 August 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}