{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T19:23:27Z","timestamp":1762111407802,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":84,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031734106"},{"type":"electronic","value":"9783031734113"}],"license":[{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73411-3_10","type":"book-chapter","created":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T20:08:46Z","timestamp":1732306126000},"page":"164-182","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["DetToolChain: A New Prompting Paradigm to\u00a0Unleash Detection Ability of\u00a0MLLM"],"prefix":"10.1007","author":[{"given":"Yixuan","family":"Wu","sequence":"first","affiliation":[]},{"given":"Yizhou","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shixiang","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Wenhao","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Tong","family":"He","sequence":"additional","affiliation":[]},{"given":"Wanli","family":"Ouyang","sequence":"additional","affiliation":[]},{"given":"Philip","family":"Torr","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Wu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,23]]},"reference":[{"key":"10_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"10_CR2","unstructured":"Bai, J., et\u00a0al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"10_CR3","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"10_CR4","doi-asserted-by":"crossref","unstructured":"Bansal, A., Sikka, K., Sharma, G., Chellappa, R., Divakaran, A.: Zero-shot object detection. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 384\u2013400 (2018)","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"10_CR5","unstructured":"Besta, M., P., et\u00a0al.: Graph of thoughts: solving elaborate problems with large language models. arXiv preprint arXiv:2308.09687 (2023)"},{"key":"10_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR7","doi-asserted-by":"crossref","unstructured":"Buri\u0107, M., Pobar, M., Iva\u0161i\u0107-Kos, M.: Object detection in sports videos. In: 2018 41st International Convention on Information and Communication Technology, Electronics and Microelectronics (MIPRO), pp. 1034\u20131039. 
IEEE (2018)","DOI":"10.23919\/MIPRO.2018.8400189"},{"key":"10_CR8","doi-asserted-by":"publisher","first-page":"14531","DOI":"10.1109\/ACCESS.2020.2966881","volume":"8","author":"W Cao","year":"2020","unstructured":"Cao, W., Liu, Q., He, Z.: Review of pavement defect detection methods. IEEE Access 8, 14531\u201314544 (2020). https:\/\/doi.org\/10.1109\/ACCESS.2020.2966881","journal-title":"IEEE Access"},{"key":"10_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"10_CR10","doi-asserted-by":"crossref","unstructured":"Chen, B., et al.: SpatialVLM: endowing vision-language models with spatial reasoning capabilities. arXiv preprint arXiv:2401.12168 (2024)","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"10_CR11","unstructured":"Chen, J., Li, D.Z.X.S.X., Zhang, Z.L.P., Xiong, R.K.V.C.Y., Elhoseiny, M.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"10_CR12","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"10_CR13","unstructured":"Chen, P., et al.: Open vocabulary object detection with proposal mining and prediction equalization. arXiv preprint arXiv:2206.11134 (2022)"},{"key":"10_CR14","doi-asserted-by":"crossref","unstructured":"Chen, S., Sun, P., Song, Y., Luo, P.: DiffusionDet: diffusion model for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 19830\u201319843 (2023)","DOI":"10.1109\/ICCV51070.2023.01816"},{"key":"10_CR15","unstructured":"Chen, X., Lin, M., Sch\u00e4rli, N., Zhou, D.: Teaching large language models to self-debug. arXiv preprint arXiv:2304.05128 (2023)"},{"issue":"2","key":"10_CR16","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1007\/s11633-023-1380-5","volume":"21","author":"YC Chou","year":"2024","unstructured":"Chou, Y.C., Li, B., Fan, D.P., Yuille, A., Zhou, Z.: Acquiring weak annotations for tumor localization in temporal and volumetric data. Mach. Intell. Res. 21(2), 318\u2013330 (2024)","journal-title":"Mach. Intell. Res."},{"issue":"5","key":"10_CR17","doi-asserted-by":"publisher","first-page":"1459","DOI":"10.3390\/s20051459","volume":"20","author":"T Czimmermann","year":"2020","unstructured":"Czimmermann, T., et al.: Visual-based defect detection and classification approaches for industrial applications-a survey. Sensors 20(5), 1459 (2020)","journal-title":"Sensors"},{"key":"10_CR18","unstructured":"Dai, G., Shu, X., Wu, W.: GPT4Ego: unleashing the potential of pre-trained models for zero-shot egocentric action recognition. arXiv preprint arXiv:2401.10039 (2024)"},{"key":"10_CR19","first-page":"32942","volume":"35","author":"ZY Dou","year":"2022","unstructured":"Dou, Z.Y., et al.: Coarse-to-fine vision-language pre-training with fusion in the backbone. Adv. Neural. Inf. Process. Syst. 35, 32942\u201332956 (2022)","journal-title":"Adv. Neural. Inf. Process. 
Syst."},{"key":"10_CR20","unstructured":"Du, Y., Li, S., Torralba, A., Tenenbaum, J.B., Mordatch, I.: Improving factuality and reasoning in language models through multiagent debate. arXiv preprint arXiv:2305.14325 (2023)"},{"key":"10_CR21","unstructured":"Gu, X., Lin, T.Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)"},{"key":"10_CR22","doi-asserted-by":"crossref","unstructured":"Han, Z., Zhu, F., Lao, Q., Jiang, H.: Zero-shot referring expression comprehension via structural similarity between images and captions. arXiv preprint arXiv:2311.17048 (2023)","DOI":"10.1109\/CVPR52733.2024.01362"},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"10_CR24","doi-asserted-by":"crossref","unstructured":"Kim, D., Angelova, A., Kuo, W.: Contrastive feature masking open-vocabulary vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15602\u201315612 (2023)","DOI":"10.1109\/ICCV51070.2023.01430"},{"key":"10_CR25","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"10_CR26","unstructured":"Lei, B., Liao, C., Ding, C., et\u00a0al.: Boosting logical reasoning in large language models through a new framework: the graph of thought. arXiv preprint arXiv:2308.08614 (2023)"},{"key":"10_CR27","unstructured":"Lei, X., Yang, Z., Chen, X., Li, P., Liu, Y.: Scaffolding coordinates to promote vision-language coordination in large multi-modal models. arXiv preprint arXiv:2402.12058 (2024)"},{"key":"10_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"TY Lin","year":"2014","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"10_CR29","doi-asserted-by":"crossref","unstructured":"Lin, Z., et\u00a0al.: Sphinx: the joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575 (2023)","DOI":"10.1007\/978-3-031-73033-7_3"},{"issue":"1","key":"10_CR30","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/s11633-023-1459-z","volume":"21","author":"J Liu","year":"2024","unstructured":"Liu, J., et al.: Deep industrial image anomaly detection: a survey. Mach. Intell. Res. 21(1), 104\u2013135 (2024)","journal-title":"Mach. Intell. Res."},{"key":"10_CR31","doi-asserted-by":"crossref","unstructured":"Liu, S., et al.: Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"issue":"8","key":"10_CR32","doi-asserted-by":"publisher","first-page":"1074","DOI":"10.1109\/LGRS.2016.2565705","volume":"13","author":"Z Liu","year":"2016","unstructured":"Liu, Z., Wang, H., Weng, L., Yang, Y.: Ship rotated bounding box space for ship extraction from high-resolution optical satellite images with complex backgrounds. IEEE Geosci. Remote Sens. Lett. 
13(8), 1074\u20131078 (2016)","journal-title":"IEEE Geosci. Remote Sens. Lett."},{"key":"10_CR33","unstructured":"Lu, P., et al.: Chameleon: plug-and-play compositional reasoning with large language models. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"10_CR34","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 11\u201320 (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"10_CR35","doi-asserted-by":"publisher","unstructured":"Minderer, M., et\u00a0al.: Simple open-vocabulary object detection. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13670. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20080-9_42","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"10_CR36","doi-asserted-by":"crossref","unstructured":"Mitra, C., Huang, B., Darrell, T., Herzig, R.: Compositional chain-of-thought prompting for large multimodal models. arXiv preprint arXiv:2311.17076 (2023)","DOI":"10.1109\/CVPR52733.2024.01367"},{"key":"10_CR37","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"issue":"1","key":"10_CR38","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"10_CR39","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. 28 (2015)"},{"key":"10_CR40","unstructured":"Schick, T., et al.: PEER: a collaborative language model. arXiv preprint arXiv:2208.11663 (2022)"},{"key":"10_CR41","doi-asserted-by":"publisher","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-CAM: visual explanations from deep networks via gradient-based localization. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 618\u2013626 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.74","DOI":"10.1109\/ICCV.2017.74"},{"key":"10_CR42","doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Rupprecht, C., Vedaldi, A.: What does clip know about a red circle? Visual prompt engineering for VLMs. arXiv preprint arXiv:2304.06712 (2023)","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"10_CR43","doi-asserted-by":"crossref","unstructured":"Subramanian, S., Merrill, W., Darrell, T., Gardner, M., Singh, S., Rohrbach, A.: ReCLIP: a strong zero-shot baseline for referring expression comprehension. arXiv preprint arXiv:2204.05991 (2022)","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"10_CR44","doi-asserted-by":"crossref","unstructured":"Sun, P., et\u00a0al.: Sparse R-CNN: end-to-end object detection with learnable proposals. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14454\u201314463 (2021)","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"10_CR45","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. 
arXiv preprint arXiv:2312.11805 (2023)"},{"key":"10_CR46","unstructured":"InternLM Team: InternLM: a multilingual language model with progressively enhanced capabilities (2023)"},{"key":"10_CR47","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.cviu.2017.04.011","volume":"159","author":"G Thomas","year":"2017","unstructured":"Thomas, G., Gade, R., Moeslund, T.B., Carr, P., Hilton, A.: Computer vision for sports: current applications and research topics. Comput. Vis. Image Underst. 159, 3\u201318 (2017)","journal-title":"Comput. Vis. Image Underst."},{"key":"10_CR48","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. arXiv preprint arXiv:2401.06209 (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"10_CR49","doi-asserted-by":"crossref","unstructured":"Vandeghen, R., Cioppa, A., Van\u00a0Droogenbroeck, M.: Semi-supervised training to improve player and ball detection in soccer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3481\u20133490 (2022)","DOI":"10.1109\/CVPRW56347.2022.00392"},{"key":"10_CR50","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Object-aware distillation pyramid for open-vocabulary object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11186\u201311196 (2023)","DOI":"10.1109\/CVPR52729.2023.01076"},{"key":"10_CR51","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"10_CR52","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. arXiv preprint arXiv:2305.11175 (2023)"},{"key":"10_CR53","first-page":"12435","volume":"35","author":"Y Wang","year":"2022","unstructured":"Wang, Y., et al.: Unsupervised object detection pretraining with joint object priors generation and detector learning. Adv. Neural. Inf. Process. Syst. 35, 12435\u201312448 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR54","unstructured":"Wang, Y., et\u00a0al.: Hulk: a universal knowledge translator for human-centric tasks. arXiv preprint arXiv:2312.01697 (2023)"},{"key":"10_CR55","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR56","doi-asserted-by":"crossref","unstructured":"Wu, S., Zhang, W., Jin, S., Liu, W., Loy, C.C.: Aligning bag of regions for open-vocabulary object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15254\u201315264 (2023)","DOI":"10.1109\/CVPR52729.2023.01464"},{"key":"10_CR57","unstructured":"Wu, W., Yao, H., Zhang, M., Song, Y., Ouyang, W., Wang, J.: GPT4Vis: what can GPT-4 do for zero-shot visual recognition? arXiv preprint arXiv:2311.15732 (2023)"},{"key":"10_CR58","doi-asserted-by":"crossref","unstructured":"Wu, X., Zhu, F., Zhao, R., Li, H.: CORA: adapting clip for open-vocabulary detection with region prompting and anchor pre-matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
7031\u20137040 (2023)","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"10_CR59","doi-asserted-by":"crossref","unstructured":"Wu, Y., Zhang, Z., Xie, C., Zhu, F., Zhao, R.: Advancing referring expression segmentation beyond single image. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2628\u20132638 (2023)","DOI":"10.1109\/ICCV51070.2023.00248"},{"key":"10_CR60","doi-asserted-by":"publisher","unstructured":"Wu, Y., Zheng, B., Chen, J., Chen, D.Z., Wu, J.: Self-learning and one-shot learning based single-slice annotation for 3D medical image segmentation. In: Wang, L., Dou, Q., Fletcher, P.T., Speidel, S., Li, S. (eds.) Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2022. MICCAI 2022. LNCS, vol. 13438. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-16452-1_24","DOI":"10.1007\/978-3-031-16452-1_24"},{"key":"10_CR61","unstructured":"Xie, C., Zhang, Z., Wu, Y., Zhu, F., Zhao, R., Liang, S.: Described object detection: Liberating object detection with flexible expressions. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"10_CR62","doi-asserted-by":"crossref","unstructured":"Yan, B., et al.: Universal instance perception as object discovery and retrieval. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15325\u201315336 (2023)","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"10_CR63","unstructured":"Yang, J., Zhang, H., Li, F., Zou, X., Li, C., Gao, J.: Set-of-mark prompting unleashes extraordinary visual grounding in GPT-4V. arXiv preprint arXiv:2310.11441 (2023)"},{"key":"10_CR64","doi-asserted-by":"crossref","unstructured":"Yang, L., Wang, Y., Li, X., Wang, X., Yang, J.: Fine-grained visual prompting. Adv. Neural Inf. Process. Syst. 36 (2024)","DOI":"10.1109\/TPAMI.2024.3504568"},{"key":"10_CR65","unstructured":"Yang, Z., et al.: The dawn of LMMs: preliminary explorations with GPT-4V(ision). arXiv preprint arXiv:2309.17421 (2023)"},{"key":"10_CR66","unstructured":"Yao, F., et al.: Thinking like an expert: multimodal hypergraph-of-thought (HoT) reasoning to boost foundation modals. arXiv preprint arXiv:2308.06207 (2023)"},{"key":"10_CR67","unstructured":"Yao, S., et al.: Tree of thoughts: deliberate problem solving with large language models. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"10_CR68","doi-asserted-by":"crossref","unstructured":"Yao, Y., Li, Z., Zhao, H.: Beyond chain-of-thought, effective graph-of-thought reasoning in large language models. arXiv preprint arXiv:2305.16582 (2023)","DOI":"10.18653\/v1\/2024.findings-naacl.183"},{"key":"10_CR69","unstructured":"Yao, Y., Zhang, A., Zhang, Z., Liu, Z., Chua, T.S., Sun, M.: CPT: colorful prompt tuning for pre-trained vision-language models. arXiv preprint arXiv:2109.11797 (2021)"},{"key":"10_CR70","unstructured":"Yin, Z., et\u00a0al.: LAMM: language-assisted multi-modal instruction-tuning dataset, framework, and benchmark. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"10_CR71","unstructured":"You, H., et al.: FERRET: refer and ground anything anywhere at any granularity. arXiv preprint arXiv:2310.07704 (2023)"},{"key":"10_CR72","doi-asserted-by":"crossref","unstructured":"Yu, L., et al.: MAttNet: modular attention network for referring expression comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
1307\u20131315 (2018)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"10_CR73","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"10_CR74","doi-asserted-by":"crossref","unstructured":"Zang, Y., Li, W., Han, J., Zhou, K., Loy, C.C.: Contextual object detection with multimodal large language models. arXiv preprint arXiv:2305.18279 (2023)","DOI":"10.1007\/s11263-024-02214-4"},{"key":"10_CR75","doi-asserted-by":"publisher","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Open-vocabulary DETR with conditional matching. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13669. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_7","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"10_CR76","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.D., Hu, D.H., Chang, S.F.: Open-vocabulary object detection using captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14393\u201314402 (2021)","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"10_CR77","first-page":"1","volume":"71","author":"N Zeng","year":"2022","unstructured":"Zeng, N., Wu, P., Wang, Z., Li, H., Liu, W., Liu, X.: A small-sized object detection oriented multi-scale feature fusion approach with application to defect detection. IEEE Trans. Instrum. Meas. 71, 1\u201314 (2022)","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"10_CR78","unstructured":"Zhang, Z., Zhang, A., Li, M., Smola, A.: Automatic chain of thought prompting in large language models. arXiv preprint arXiv:2210.03493 (2022)"},{"key":"10_CR79","unstructured":"Zhang, Z., Zhang, A., Li, M., Zhao, H., Karypis, G., Smola, A.: Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)"},{"key":"10_CR80","unstructured":"Zhao, Y., Lin, Z., Zhou, D., Huang, Z., Feng, J., Kang, B.: BuboGPT: enabling visual grounding in multi-modal LLMs. arXiv preprint arXiv:2307.08581 (2023)"},{"key":"10_CR81","unstructured":"Zheng, G., Yang, B., Tang, J., Zhou, H.Y., Yang, S.: DDCoT: duty-distinct chain-of-thought prompting for multimodal reasoning in language models. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"10_CR82","doi-asserted-by":"crossref","unstructured":"Zhong, Y., et\u00a0al.: RegionCLIP: region-based language-image pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16793\u201316803 (2022)","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"10_CR83","unstructured":"Zhuge, M., et\u00a0al.: Mindstorms in natural language-based societies of mind. arXiv preprint arXiv:2305.17066 (2023)"},{"key":"10_CR84","doi-asserted-by":"crossref","unstructured":"Zong, Z., Song, G., Liu, Y.: DETRs with collaborative hybrid assignments training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
6748\u20136758 (2023)","DOI":"10.1109\/ICCV51070.2023.00621"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73411-3_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:35:24Z","timestamp":1733088924000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73411-3_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,23]]},"ISBN":["9783031734106","9783031734113"],"references-count":84,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73411-3_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,23]]},"assertion":[{"value":"23 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
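
The record above is the JSON envelope returned by the Crossref REST API for this chapter's DOI, with the bibliographic payload nested under the "message" key. As a quick way to work with it, here is a minimal Python sketch; it assumes the third-party requests package is installed and that api.crossref.org is reachable, and it uses only the field names visible in the record above.

```python
# Minimal sketch: re-fetch this Crossref work record and unpack a few fields.
# Assumes `requests` is installed (pip install requests) and network access.
import requests

DOI = "10.1007/978-3-031-73411-3_10"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # bibliographic payload sits under "message"

print(work["title"][0])  # titles are returned as a one-element list
print(", ".join(f"{a.get('given', '')} {a.get('family', '')}".strip()
                for a in work.get("author", [])))
print("DOI:", work["DOI"], "| references:", work.get("references-count"))
```

For this DOI the script should print the chapter title, the eight authors, and the reference count of 84 shown in the record.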