{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:27:51Z","timestamp":1766068071010,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031723377"},{"type":"electronic","value":"9783031723384"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72338-4_30","type":"book-chapter","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T10:03:01Z","timestamp":1726480981000},"page":"448-462","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["ProGEO: Generating Prompts Through Image-Text Contrastive Learning for\u00a0Visual Geo-Localization"],"prefix":"10.1007","author":[{"given":"Jingqi","family":"Hu","sequence":"first","affiliation":[]},{"given":"Chen","family":"Mao","sequence":"additional","affiliation":[]},{"given":"Chong","family":"Tan","sequence":"additional","affiliation":[]},{"given":"Hui","family":"Li","sequence":"additional","affiliation":[]},{"given":"Hong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Min","family":"Zheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,17]]},"reference":[{"key":"30_CR1","doi-asserted-by":"publisher","first-page":"194","DOI":"10.1016\/j.neucom.2022.09.127","volume":"513","author":"A Ali-bey","year":"2022","unstructured":"Ali-bey, A., Chaib-draa, B., Gigu\u00e8re, P.: Gsv-cities: toward appropriate supervised visual place recognition. Neurocomputing 513, 194\u2013203 (2022)","journal-title":"Neurocomputing"},{"key":"30_CR2","doi-asserted-by":"crossref","unstructured":"Ali-bey, A., Chaib-draa, B., Gigu\u00e8re, P.: Mixvpr: feature mixing for visual place recognition. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2998\u20133007 (2023)","DOI":"10.1109\/WACV56688.2023.00301"},{"issue":"6","key":"30_CR3","doi-asserted-by":"publisher","first-page":"1437","DOI":"10.1109\/TPAMI.2017.2711011","volume":"40","author":"R Arandjelovi\u0107","year":"2018","unstructured":"Arandjelovi\u0107, R., Gronat, P., Torii, A., Pajdla, T., Sivic, J.: NetVLAD: CNN architecture for weakly supervised place recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40(6), 1437\u20131451 (2018). https:\/\/doi.org\/10.1109\/TPAMI.2017.2711011","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"30_CR4","doi-asserted-by":"publisher","first-page":"346","DOI":"10.1016\/j.cviu.2007.09.014","volume":"110","author":"H Bay","year":"2008","unstructured":"Bay, H., Ess, A., Tuytelaars, T., Van Gool, L.: Speeded-up robust features (surf). Comput. Vision Image Understanding 110, 346\u2013359 (2008). https:\/\/doi.org\/10.1016\/j.cviu.2007.09.014","journal-title":"Comput. Vision Image Understanding"},{"key":"30_CR5","doi-asserted-by":"crossref","unstructured":"Berton, G., Masone, C., Caputo, B.: Rethinking visual geo-localization for large-scale applications. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00483"},{"issue":"4","key":"30_CR6","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1109\/MMUL.2020.3015990","volume":"27","author":"Y Cao","year":"2020","unstructured":"Cao, Y., Zhang, J., Yu, J.: Image retrieval via gated multiscale netvlad for social media applications. IEEE Multimedia 27(4), 69\u201378 (2020)","journal-title":"IEEE Multimedia"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Cascante-Bonilla, P., Wu, H., Wang, L., Feris, R.S., Ordonez, V.: Simvqa: exploring simulated environments for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5056\u20135066 (2022)","DOI":"10.1109\/CVPR52688.2022.00500"},{"key":"30_CR8","doi-asserted-by":"crossref","unstructured":"Ding, Y., Yu, J., Liu, B., Hu, Y., Cui, M., Wu, Q.: Mukea: multimodal knowledge extraction and accumulation for knowledge-based visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5089\u20135098 (2022)","DOI":"10.1109\/CVPR52688.2022.00503"},{"key":"30_CR9","doi-asserted-by":"crossref","unstructured":"Doan, A.D., Latif, Y., Chin, T.J., Liu, Y., Do, T.T., Reid, I.: Scalable place recognition under appearance change for autonomous driving, pp. 9319\u20139328 (2019)","DOI":"10.1109\/ICCV.2019.00941"},{"key":"30_CR10","doi-asserted-by":"crossref","unstructured":"Doan, A.D., Latif, Y., Chin, T.J., Liu, Y., Do, T.T., Reid, I.: Scalable place recognition under appearance change for autonomous driving. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9319\u20139328 (2019)","DOI":"10.1109\/ICCV.2019.00941"},{"key":"30_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. ArXiv arxiv:2010.11929 (2021)"},{"key":"30_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"369","DOI":"10.1007\/978-3-030-58548-8_22","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Ge","year":"2020","unstructured":"Ge, Y., Wang, H., Zhu, F., Zhao, R., Li, H.: Self-supervising fine-grained region similarities for large-scale image localization. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 369\u2013386. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_22"},{"key":"30_CR13","doi-asserted-by":"crossref","unstructured":"Hausler, S., Garg, S., Xu, M., Milford, M., Fischer, T.: Patch-netvlad: multi-scale fusion of locally-global descriptors for place recognition, pp. 14141\u201314152 (2021)","DOI":"10.1109\/CVPR46437.2021.01392"},{"key":"30_CR14","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition, pp. 770\u2013778 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"30_CR15","unstructured":"Hermans, A., Beyer, L., Leibe, B.: In defense of the triplet loss for person re-identification. arXiv preprint arXiv:1703.07737 (2017)"},{"key":"30_CR16","doi-asserted-by":"crossref","unstructured":"Ibrahimi, S., van Noord, N., Alpherts, T., Worring, M.: Inside out visual place recognition (2021)","DOI":"10.5244\/C.35.132"},{"key":"30_CR17","doi-asserted-by":"crossref","unstructured":"Jin\u00a0Kim, H., Dunn, E., Frahm, J.M.: Learned contextual feature reweighting for image geo-localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2136\u20132145 (2017)","DOI":"10.1109\/CVPR.2017.346"},{"key":"30_CR18","doi-asserted-by":"crossref","unstructured":"Keetha, N., et al.: Anyloc: towards universal visual place recognition. IEEE Rob. Autom. Lett. (2023)","DOI":"10.1109\/LRA.2023.3343602"},{"key":"30_CR19","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: Maple: multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19113\u201319122 (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"30_CR20","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization (2014)"},{"key":"30_CR21","unstructured":"Lee, J., Toutanova, K.: Pre-training of deep bidirectional transformers for language understanding, vol. 3, p. 8. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"30_CR22","doi-asserted-by":"crossref","unstructured":"Li, S., Sun, L., Li, Q.: Clip-reid: exploiting vision-language model for image re-identification without concrete text labels. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 1405\u20131413 (2023)","DOI":"10.1609\/aaai.v37i1.25225"},{"key":"30_CR23","doi-asserted-by":"crossref","unstructured":"Liu, L., Li, H., Dai, Y.: Stochastic attraction-repulsion embedding for large scale image localization (2019)","DOI":"10.1109\/ICCV.2019.00266"},{"issue":"2","key":"30_CR24","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe, D.G.: Distinctive image features from scale-invariant keypoints. Int. J. Comput. Vision 60(2), 91\u2013110 (2004). https:\/\/doi.org\/10.1023\/B:VISI.0000029664.99615.94","journal-title":"Int. J. Comput. Vision"},{"key":"30_CR25","doi-asserted-by":"publisher","first-page":"1038","DOI":"10.1109\/TRO.2008.2004520","volume":"24","author":"M Milford","year":"2008","unstructured":"Milford, M., Wyeth, G.: Mapping a suburb with a single camera using a biologically inspired slam system. IEEE Trans. Rob. 24, 1038\u20131053 (2008)","journal-title":"IEEE Trans. Rob."},{"key":"30_CR26","doi-asserted-by":"crossref","unstructured":"Peng, G., Yue, Y., Zhang, J., Wu, Z., Tang, X., Wang, D.: Semantic reinforced attention learning for visual place recognition, pp. 13415\u201313422. IEEE (2021)","DOI":"10.1109\/ICRA48506.2021.9561812"},{"key":"30_CR27","doi-asserted-by":"crossref","unstructured":"Peng, G., Zhang, J., Li, H., Wang, D.: Attentional pyramid pooling of salient visual residuals for place recognition, pp. 885\u2013894 (2021)","DOI":"10.1109\/ICCV48922.2021.00092"},{"key":"30_CR28","doi-asserted-by":"crossref","unstructured":"Radenovi\u0107, F., Tolias, G., Chum, O.: Fine-tuning CNN image retrieval with no human annotation (2018)","DOI":"10.1109\/TPAMI.2018.2846566"},{"key":"30_CR29","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"30_CR30","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: Denseclip: language-guided dense prediction with context-aware prompting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18082\u201318091 (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"30_CR31","doi-asserted-by":"crossref","unstructured":"Song, H., Dong, L., Zhang, W.N., Liu, T., Wei, F.: Clip models are few-shot learners: empirical studies on VQA and visual entailment. arXiv preprint arXiv:2203.07190 (2022)","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"30_CR32","doi-asserted-by":"crossref","unstructured":"Torii, A., Arandjelovi?, R., Sivic, J., Okutomi, M., Pajdla, T.: 24\/7 place recognition by view synthesis. 40(2), 257\u2013271 (2018)","DOI":"10.1109\/TPAMI.2017.2667665"},{"key":"30_CR33","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NIPS 2017, pp. 6000\u20136010 (2017)"},{"issue":"4","key":"30_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530068","volume":"41","author":"Y Vinker","year":"2022","unstructured":"Vinker, Y., et al.: Clipasso: semantically-aware object sketching. ACM Trans. Graph. (TOG) 41(4), 1\u201311 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"30_CR35","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Cosface: large margin cosine loss for deep face recognition, pp. 5265\u20135274. Computer Vision Foundation\/IEEE Computer Society (2018)","DOI":"10.1109\/CVPR.2018.00552"},{"key":"30_CR36","doi-asserted-by":"crossref","unstructured":"Wang, R., Shen, Y., Zuo, W., Zhou, S., Zheng, N.: Transvpr: transformer-based place recognition with multi-level attention aggregation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13648\u201313657 (2022)","DOI":"10.1109\/CVPR52688.2022.01328"},{"key":"30_CR37","doi-asserted-by":"crossref","unstructured":"Warburg, F., Hauberg, S., Lopez-Antequera, M., Gargallo, P., Kuang, Y., Civera, J.: Mapillary street-level sequences: a dataset for lifelong place recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2626\u20132635 (2020)","DOI":"10.1109\/CVPR42600.2020.00270"},{"key":"30_CR38","doi-asserted-by":"crossref","unstructured":"Wu, M., Huang, Q.: Im2city: image geo-localization via multi-modal learning. In: Proceedings of the 5th ACM SIGSPATIAL International Workshop on AI for Geographic Knowledge Discovery, pp. 50\u201361 (2022)","DOI":"10.1145\/3557918.3565868"},{"key":"30_CR39","first-page":"29009","volume":"34","author":"H Yang","year":"2021","unstructured":"Yang, H., Lu, X., Zhu, Y.: Cross-view geo-localization with layer-to-layer transformer. Adv. Neural. Inf. Process. Syst. 34, 29009\u201329020 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"30_CR40","unstructured":"Yao, Y., Zhang, A., Zhang, Z., Liu, Z., Chua, T.S., Sun, M.: CPT: colorful prompt tuning for pre-trained vision-language models. arXiv preprint arXiv:2109.11797 (2021)"},{"issue":"2","key":"30_CR41","doi-asserted-by":"publisher","first-page":"661","DOI":"10.1109\/TNNLS.2019.2908982","volume":"31","author":"J Yu","year":"2020","unstructured":"Yu, J., Zhu, C., Zhang, J., Huang, Q., Tao, D.: Spatial pyramid-enhanced netvlad with weighted triplet loss for place recognition. IEEE Trans. Neural Netw. Learn. Syst. 31(2), 661\u2013674 (2020). https:\/\/doi.org\/10.1109\/TNNLS.2019.2908982","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"issue":"7","key":"30_CR42","doi-asserted-by":"publisher","first-page":"2136","DOI":"10.1007\/s11263-021-01469-5","volume":"129","author":"M Zaffar","year":"2021","unstructured":"Zaffar, M., et al.: VPR-bench: an open-source visual place recognition evaluation framework with quantifiable viewpoint and appearance change. Int. J. Comput. Vision 129(7), 2136\u20132174 (2021)","journal-title":"Int. J. Comput. Vision"},{"issue":"9","key":"30_CR43","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"30_CR44","doi-asserted-by":"crossref","unstructured":"Zhu, S., Shah, M., Chen, C.: Transgeo: transformer is all you need for cross-view image geo-localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1162\u20131171 (2022)","DOI":"10.1109\/CVPR52688.2022.00123"},{"key":"30_CR45","doi-asserted-by":"crossref","unstructured":"Zhu, S., Yang, L., Chen, C., Shah, M., Shen, X., Wang, H.: R2former: unified retrieval and reranking transformer for place recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19370\u201319380 (2023)","DOI":"10.1109\/CVPR52729.2023.01856"},{"key":"30_CR46","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Wang, J., Xie, L., Zheng, L.: Attention-based pyramid aggregation network for visual place recognition. In: 2018 ACM Multimedia Conference on Multimedia Conference, MM 2018, Seoul, Republic of Korea, 22\u201326 October 2018, pp. 99\u2013107. ACM (2018)","DOI":"10.1145\/3240508.3240525"}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning \u2013 ICANN 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72338-4_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T07:49:57Z","timestamp":1759996197000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72338-4_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031723377","9783031723384"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72338-4_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"17 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lugano","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"33","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icann2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}