{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T16:49:22Z","timestamp":1761929362041,"version":"build-2065373602"},"publisher-location":"Singapore","reference-count":45,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819537280","type":"print"},{"value":"9789819537297","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3729-7_16","type":"book-chapter","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T16:44:06Z","timestamp":1761929046000},"page":"187-199","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["CAPNet: Context-Aware Prompt Network for\u00a0Weakly-Supervised Open-World 
Phrase-Grounding"],"prefix":"10.1007","author":[{"given":"Hui","family":"Yuan","sequence":"first","affiliation":[]},{"given":"Naigong","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Zhaoxuan","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Yan","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Jianhua","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Jinhan","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Zhiwen","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Liang","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,1]]},"reference":[{"key":"16_CR1","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale, arXiv preprint arXiv:2010.11929 (2020)"},{"key":"16_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"491","DOI":"10.1007\/978-3-030-58558-7_29","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Kolesnikov","year":"2020","unstructured":"Kolesnikov, A., et al.: Big transfer (BiT): general visual representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 491\u2013507. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_29"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 580\u2013587 (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"16_CR4","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. Advances in Neural Information Processing Syst. 
28 (2015)"},{"issue":"4","key":"16_CR5","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"L-C Chen","year":"2017","unstructured":"Chen, L.-C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected crfs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"16_CR7","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"16_CR8","doi-asserted-by":"publisher","unstructured":"Zhang, R. et al.: Tip-Adapter: Training-free adaption of CLIP for few-shot classification. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13695. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_29","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Shen, H., Zhao, T., Zhu, M., Yin, J.: Groundvlp: harnessing zero-shot visual grounding from vision-language pre-training and open-vocabulary object detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38(5), pp. 4766\u20134775 (2024)","DOI":"10.1609\/aaai.v38i5.28278"},{"key":"16_CR10","unstructured":"Shaharabany, T., Tewel, Y., Wolf, L.: What is where by looking: Weakly-supervised open-world phrase-grounding without text inputs. Adv. Neural Inform. Process. Syst. 
35, 28222\u201328237 (2022)"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"Shaharabany, T., Wolf, L.: Similarity maps for self-training weakly-supervised phrase grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6925\u20136934 (2023)","DOI":"10.1109\/CVPR52729.2023.00669"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Chen, S., Luo, G., Zhou, Y., Sun, X., Jiang, G., Ji, R.: Querymatch: a query-based contrastive learning framework for weakly supervised visual grounding. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 4177\u20134186 (2024)","DOI":"10.1145\/3664647.3681058"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Kuang, D., Zhang, R., Nie, Z., Chen, J., Kim, J.: Momentum pseudo-labeling for weakly supervised phrase grounding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a039(23), pp. 24348\u201324356 (2025)","DOI":"10.1609\/aaai.v39i23.34612"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wan, B., Ma, L., He, X.:\u00a0Relation-aware instance refinement for weakly supervised visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5612\u20135621 (2021)","DOI":"10.1109\/CVPR46437.2021.00556"},{"key":"16_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1007\/978-3-030-58580-8_4","volume-title":"Computer Vision \u2013 ECCV 2020","author":"E Rusak","year":"2020","unstructured":"Rusak, E., et al.: A simple way to make neural networks robust against diverse image corruptions. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 53\u201369. Springer, Cham (2020). 
https:\/\/doi.org\/10.1007\/978-3-030-58580-8_4"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Javed, S.A., Saxena, S., Gandhi, V.: Learning unsupervised visual grounding through semantic self-supervision, arXiv preprint, arXiv:1803.06506 (2018)","DOI":"10.24963\/ijcai.2019\/112"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Akbari, H., Karaman, S., Bhargava, S., Chen, B., Vondrick, C., Chang, S.-F.: Multi-level multimodal common semantic space for image-phrase grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12476\u201312486 (2019)","DOI":"10.1109\/CVPR.2019.01276"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Arbelle, A., et\u00a0al.: Detector-free weakly supervised grounding by separation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1801\u20131812 (2021)","DOI":"10.1109\/ICCV48922.2021.00182"},{"key":"16_CR19","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Rupprecht, C., Vedaldi, A.: What does clip know about a red circle? visual prompt engineering for vlms. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11987\u201311997 (2023)","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"16_CR21","unstructured":"Yang, L., Wang, Y., Li, X., Wang, X., Yang, J.: Fine-grained visual prompting. 
In: Advances in Neural Information Processing Systems 36, 24993\u201325006 (2023)"},{"issue":"9","key":"16_CR22","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3560815","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint, arXiv:2101.00190 (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Du, Y., Wei, F., Zhang, Z., Shi, M., Gao, Y., Li, G.: Learning to prompt for open-vocabulary object detection with vision-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14084\u201314093 (2022)","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"16_CR25","doi-asserted-by":"publisher","unstructured":"Jia, M. et al.: Visual prompt tuning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13693. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19827-4_41","DOI":"10.1007\/978-3-031-19827-4_41"},{"issue":"9","key":"16_CR26","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Lu, Y., Liu, J., Zhang, Y., Liu, Y., Tian, X.: Prompt distribution learning. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5206\u20135215 (2022)","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Ding, Z., et al.: Exploring structured semantic prior for multi label recognition with incomplete labels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3398\u20133407 (2023)","DOI":"10.1109\/CVPR52729.2023.00331"},{"issue":"5","key":"16_CR29","doi-asserted-by":"publisher","first-page":"3450","DOI":"10.1109\/TPAMI.2023.3346405","volume":"46","author":"P Hu","year":"2023","unstructured":"Hu, P., Sun, X., Sclaroff, S., Saenko, K.: Dualcoop++: fast and effective adaptation to multi-label recognition with limited annotations. IEEE Trans. Pattern Anal. Mach. Intell. 46(5), 3450\u20133462 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: Denseclip: language-guided dense prediction with context-aware prompting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18082\u201318091 (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"16_CR31","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol.\u00a030 (2017)"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Transformer interpretability beyond attention visualization. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 782\u2013791 (2021)","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3d object representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 
554\u2013561 (2013)","DOI":"10.1109\/ICCVW.2013.77"},{"key":"16_CR34","unstructured":"Khosla, A., Jayadevaprakash, N., Yao, B., Li, F.-F.: Novel dataset for fine-grained image categorization: Stanford dogs. In: Proc. CVPR Workshop on Fine-grained Visual Categorization (FGVC), vol. 2(1) (2011)"},{"key":"16_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"16_CR36","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"16_CR38","unstructured":"Grubinger, M., Clough, P., M\u00fcller, H., Deselaers, T.: The iapr tc-12 benchmark: a new evaluation resource for visual information systems. In: International Workshop Ontoimage 2 (2006)"},{"key":"16_CR39","doi-asserted-by":"crossref","unstructured":"Chen, K., Kovvuri, R., Nevatia, R.: Query-guided regression network with context policy for phrase grounding. 
In: Proceedings of the IEEE International Conference on Computer Vision, pp. 824\u2013832 (2017)","DOI":"10.1109\/ICCV.2017.95"},{"issue":"10","key":"16_CR40","doi-asserted-by":"publisher","first-page":"1084","DOI":"10.1007\/s11263-017-1059-x","volume":"126","author":"J Zhang","year":"2018","unstructured":"Zhang, J., Bargal, S.A., Lin, Z., Brandt, J., Shen, X., Sclaroff, S.: Top-down neural attention by excitation backprop. Int. J. Comput. Vision 126(10), 1084\u20131102 (2018)","journal-title":"Int. J. Comput. Vision"},{"key":"16_CR41","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"16_CR42","doi-asserted-by":"crossref","unstructured":"Choe, J., Shim, H.: Attention-based dropout layer for weakly supervised object localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2219\u20132228 (2019)","DOI":"10.1109\/CVPR.2019.00232"},{"key":"16_CR43","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107949","volume":"116","author":"J Choe","year":"2021","unstructured":"Choe, J., Han, D., Yun, S., Ha, J.-W., Oh, S.J., Shim, H.: Region-based dropout with attention prior for weakly supervised object localization. Pattern Recogn. 116, 107949 (2021)","journal-title":"Pattern Recogn."},{"key":"16_CR44","doi-asserted-by":"crossref","unstructured":"Shaharabany, T., Wolf, L.: Learning a weight map for weakly-supervised localization. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. 
IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10094940"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 397\u2013406 (2021)","DOI":"10.1109\/ICCV48922.2021.00045"}],"container-title":["Lecture Notes in Computer Science","Image and Graphics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3729-7_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T16:44:12Z","timestamp":1761929052000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3729-7_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,1]]},"ISBN":["9789819537280","9789819537297"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3729-7_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,1]]},"assertion":[{"value":"1 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIG","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Image and Graphics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Xuzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference 
Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icig2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icig.csig.org.cn\/2025\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}