{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:25:53Z","timestamp":1777656353638,"version":"3.51.4"},"publisher-location":"Cham","reference-count":80,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198380","type":"print"},{"value":"9783031198397","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19839-7_12","type":"book-chapter","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T11:40:06Z","timestamp":1666438806000},"page":"196-215","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":26,"title":["Where in\u00a0the\u00a0World Is This Image? Transformer-Based Geo-localization in\u00a0the\u00a0Wild"],"prefix":"10.1007","author":[{"given":"Shraman","family":"Pramanick","sequence":"first","affiliation":[]},{"given":"Ewa M.","family":"Nowara","sequence":"additional","affiliation":[]},{"given":"Joshua","family":"Gleason","sequence":"additional","affiliation":[]},{"given":"Carlos D.","family":"Castillo","sequence":"additional","affiliation":[]},{"given":"Rama","family":"Chellappa","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,23]]},"reference":[{"key":"12_CR1","unstructured":"Akbari, H., et al.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. In: Advances in Neural Information Processing Systems, vol. 34, pp. 24206\u201324221 (2021)"},{"key":"12_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1007\/978-3-642-33709-3_37","volume-title":"Computer Vision \u2013 ECCV 2012","author":"G Baatz","year":"2012","unstructured":"Baatz, G., Saurer, O., K\u00f6ser, K., Pollefeys, M.: Large scale visual geo-localization of images in mountainous terrain. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7573, pp. 517\u2013530. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33709-3_37"},{"key":"12_CR3","doi-asserted-by":"crossref","unstructured":"Berton, G., Masone, C., Caputo, B.: Rethinking visual geo-localization for large-scale applications. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4878\u20134888 (2022)","DOI":"10.1109\/CVPR52688.2022.00483"},{"issue":"3","key":"12_CR4","doi-asserted-by":"publisher","first-page":"613","DOI":"10.1007\/s10044-017-0611-1","volume":"20","author":"J Brejcha","year":"2017","unstructured":"Brejcha, J., \u010cad\u00edk, M.: State-of-the-art in visual geo-localization. Pattern Anal. Appl. 20(3), 613\u2013637 (2017)","journal-title":"Pattern Anal. Appl."},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Cao, L., Smith, J.R., Wen, Z., Yin, Z., Jin, X., Han, J.: Bluefinder: estimate where a beach photo was taken. In: Proceedings of the 21st International Conference on World Wide Web, pp. 469\u2013470 (2012)","DOI":"10.1145\/2187980.2188081"},{"key":"12_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"issue":"1","key":"12_CR7","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1023\/A:1007379606734","volume":"28","author":"R Caruana","year":"1997","unstructured":"Caruana, R.: Multitask learning. Mach. Learn. 28(1), 41\u201375 (1997)","journal-title":"Mach. Learn."},{"key":"12_CR8","unstructured":"Chen, D.M., et al.: City-scale landmark identification on mobile devices. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 737\u2013744 (2011)"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Chen, H., et al.: Pre-trained image processing transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12299\u201312310 (2021)","DOI":"10.1109\/CVPR46437.2021.01212"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"12_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota, Jun 2019. 10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423"},{"key":"12_CR12","unstructured":"Doersch, C., Gupta, A., Zisserman, A.: Crosstransformers: spatially-aware few-shot transfer. In: Advances in Neural Information Processing Systems, vol. 33, pp. 21981\u201321993 (2020)"},{"key":"12_CR13","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"12_CR14","unstructured":"Fang, Y., Liao, B., Wang, X., Fang, J., Qi, J., Wu, R., Niu, J., Liu, W.: You only look at one sequence: Rethinking transformer in vision through object detection. Advances in Neural Information Processing Systems 34 (2021)"},{"key":"12_CR15","unstructured":"Gu, Y., Yang, K., Fu, S., Chen, S., Li, X., Marsic, I.: Hybrid attention based multimodal network for spoken language classification. In: ACL, vol. 2018, p. 2379 (2018)"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Hausler, S., Garg, S., Xu, M., Milford, M., Fischer, T.: Patch-netvlad: multi-scale fusion of locally-global descriptors for place recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14141\u201314152 (2021)","DOI":"10.1109\/CVPR46437.2021.01392"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Hays, J., Efros, A.A.: Im2gps: estimating geographic information from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1\u20138. IEEE (2008)","DOI":"10.1109\/CVPR.2008.4587784"},{"key":"12_CR18","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1007\/978-3-319-09861-6_3","volume-title":"Multimodal Location Estimation of Videos and Images","author":"J Hays","year":"2015","unstructured":"Hays, J., Efros, A.A.: Large-scale image geolocalization. In: Choi, J., Friedland, G. (eds.) Multimodal Location Estimation of Videos and Images, pp. 41\u201362. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-09861-6_3"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A.: Unit: multimodal multitask learning with a unified transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1439\u20131449 (2021)","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Hu, S., Feng, M., Nguyen, R.M., Lee, G.H.: Cvm-net: Cross-view matching network for image-based ground-to-aerial geo-localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7258\u20137267 (2018)","DOI":"10.1109\/CVPR.2018.00758"},{"key":"12_CR22","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-030-46147-8_1","volume-title":"Machine Learning and Knowledge Discovery in Databases","author":"M Izbicki","year":"2020","unstructured":"Izbicki, M., Papalexakis, E.E., Tsotras, V.J.: Exploiting the earth\u2019s spherical geometry to geolocate images. In: Brefeld, U., Fromont, E., Hotho, A., Knobbe, A., Maathuis, M., Robardet, C. (eds.) ECML PKDD 2019. LNCS (LNAI), vol. 11907, pp. 3\u201319. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-46147-8_1"},{"key":"12_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"715","DOI":"10.1007\/978-3-030-58545-7_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Kant","year":"2020","unstructured":"Kant, Y., et al.: Spatially aware multimodal transformers for TextVQA. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12354, pp. 715\u2013732. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58545-7_41"},{"issue":"1","key":"12_CR24","doi-asserted-by":"publisher","first-page":"93","DOI":"10.1109\/MMUL.2017.9","volume":"24","author":"M Larson","year":"2017","unstructured":"Larson, M., Soleymani, M., Gravier, G., Ionescu, B., Jones, G.J.: The benchmarking initiative for multimedia evaluation: mediaeval 2016. IEEE Multimedia 24(1), 93\u201396 (2017)","journal-title":"IEEE Multimedia"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Lei, J., Li, L., Zhou, L., Gan, Z., Berg, T.L., Bansal, M., Liu, J.: Less is more: clipbert for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Li, L.H., et al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"12_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Belongie, S., Hays, J.: Cross-view image geolocalization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 891\u2013898 (2013)","DOI":"10.1109\/CVPR.2013.120"},{"key":"12_CR29","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized Bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"12_CR30","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"12_CR31","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"12_CR32","doi-asserted-by":"publisher","first-page":"19516","DOI":"10.1109\/ACCESS.2021.3054937","volume":"9","author":"C Masone","year":"2021","unstructured":"Masone, C., Caputo, B.: A survey on deep visual place recognition. IEEE Access 9, 19516\u201319547 (2021)","journal-title":"IEEE Access"},{"key":"12_CR33","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3D object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2906\u20132917 (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"12_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"575","DOI":"10.1007\/978-3-030-01258-8_35","volume-title":"Computer Vision \u2013 ECCV 2018","author":"E M\u00fcller-Budack","year":"2018","unstructured":"M\u00fcller-Budack, E., Pustu-Iren, K., Ewerth, R.: Geolocation estimation of photos using a hierarchical model and scene classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11216, pp. 575\u2013592. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01258-8_35"},{"key":"12_CR35","unstructured":"Paszke, A., et al.: Pytorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"12_CR36","doi-asserted-by":"crossref","unstructured":"Peters, M., Neumann, M., Iyyer, M., Gardner, M., Clark, C., Lee, K., Zettlemoyer, L.: Deep contextualized word representations. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers), pp. 2227\u20132237 (2018)","DOI":"10.18653\/v1\/N18-1202"},{"key":"12_CR37","doi-asserted-by":"crossref","unstructured":"Pramanick, S., Roy, A., Patel, V.M.: Multimodal learning using optimal transport for sarcasm and humor detection. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 3930\u20133940 (2022)","DOI":"10.1109\/WACV51458.2022.00062"},{"key":"12_CR38","doi-asserted-by":"crossref","unstructured":"Pramanick, S., Sharma, S., Dimitrov, D., Akhtar, M.S., Nakov, P., Chakraborty, T.: Momenta: a multimodal framework for detecting harmful memes and their targets. In: Findings of the Association for Computational Linguistics: EMNLP 2021, pp. 4439\u20134455 (2021)","DOI":"10.18653\/v1\/2021.findings-emnlp.379"},{"key":"12_CR39","unstructured":"Raghu, M., Unterthiner, T., Kornblith, S., Zhang, C., Dosovitskiy, A.: Do vision transformers see like convolutional neural networks? In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"issue":"1","key":"12_CR40","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1109\/TPAMI.2017.2781233","volume":"41","author":"R Ranjan","year":"2017","unstructured":"Ranjan, R., Patel, V.M., Chellappa, R.: Hyperface: a deep multi-task learning framework for face detection, landmark localization, pose estimation, and gender recognition. IEEE Trans. Pattern Anal. Mach. Intell. 41(1), 121\u2013135 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Regmi, K., Shah, M.: Bridging the domain gap for ground-to-aerial image matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 470\u2013479 (2019)","DOI":"10.1109\/ICCV.2019.00056"},{"key":"12_CR42","unstructured":"Ridnik, T., Ben-Baruch, E., Noy, A., Zelnik-Manor, L.: Imagenet-21k pretraining for the masses. arXiv preprint arXiv:2104.10972 (2021)"},{"key":"12_CR43","unstructured":"Ruder, S.: An overview of multi-task learning in deep neural networks. arXiv preprint arXiv:1706.05098 (2017)"},{"issue":"3","key":"12_CR44","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/s11263-015-0830-0","volume":"116","author":"O Saurer","year":"2016","unstructured":"Saurer, O., Baatz, G., K\u00f6ser, K., Pollefeys, M., et al.: Image based geo-localization in the alps. Int. J. Comput. Vision 116(3), 213\u2013225 (2016)","journal-title":"Int. J. Comput. Vision"},{"key":"12_CR45","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"12_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"544","DOI":"10.1007\/978-3-030-01249-6_33","volume-title":"Computer Vision \u2013 ECCV 2018","author":"PH Seo","year":"2018","unstructured":"Seo, P.H., Weyand, T., Sim, J., Han, B.: CPlaNet: enhancing image geolocalization by combinatorial partitioning of maps. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11214, pp. 544\u2013560. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01249-6_33"},{"key":"12_CR47","unstructured":"Seymour, Z., Sikka, K., Chiu, H.P., Samarasekera, S., Kumar, R.: Semantically-aware attentive neural embeddings for image-based visual localization. arXiv preprint arXiv:1812.03402 (2018)"},{"key":"12_CR48","doi-asserted-by":"crossref","unstructured":"Strudel, R., Garcia, R., Laptev, I., Schmid, C.: Segmenter: transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7262\u20137272 (2021)","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"12_CR49","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: Videobert: a joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 7464\u20137473 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"12_CR50","unstructured":"Tan, M., Le, Q.: Efficientnet: rethinking model scaling for convolutional neural networks. In: International Conference on Machine Learning, pp. 6105\u20136114. PMLR (2019)"},{"key":"12_CR51","doi-asserted-by":"crossref","unstructured":"Theiner, J., M\u00fcller-Budack, E., Ewerth, R.: Interpretable semantic photo geolocation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 750\u2013760 (2022)","DOI":"10.1109\/WACV51458.2022.00154"},{"issue":"2","key":"12_CR52","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., et al.: Yfcc100m: the new data in multimedia research. Commun. ACM 59(2), 64\u201373 (2016)","journal-title":"Commun. ACM"},{"key":"12_CR53","doi-asserted-by":"crossref","unstructured":"Tian, Y., Chen, C., Shah, M.: Cross-view image matching for geo-localization in urban environments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3608\u20133616 (2017)","DOI":"10.1109\/CVPR.2017.216"},{"key":"12_CR54","doi-asserted-by":"crossref","unstructured":"Toker, A., Zhou, Q., Maximov, M., Leal-Taix\u00e9, L.: Coming down to earth: Satellite-to-street view synthesis for geo-localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6488\u20136497 (2021)","DOI":"10.1109\/CVPR46437.2021.00642"},{"key":"12_CR55","doi-asserted-by":"crossref","unstructured":"Torii, A., Arandjelovic, R., Sivic, J., Okutomi, M., Pajdla, T.: 24\/7 place recognition by view synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1808\u20131817 (2015)","DOI":"10.1109\/CVPR.2015.7298790"},{"key":"12_CR56","doi-asserted-by":"crossref","unstructured":"Torii, A., Sivic, J., Pajdla, T., Okutomi, M.: Visual place recognition with repetitive structures. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 883\u2013890 (2013)","DOI":"10.1109\/CVPR.2013.119"},{"key":"12_CR57","doi-asserted-by":"crossref","unstructured":"Tzeng, E., Zhai, A., Clements, M., Townshend, R., Zakhor, A.: User-driven geolocation of untagged desert imagery using digital elevation models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 237\u2013244 (2013)","DOI":"10.1109\/CVPRW.2013.42"},{"key":"12_CR58","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"12_CR59","doi-asserted-by":"crossref","unstructured":"Vo, N., Jacobs, N., Hays, J.: Revisiting im2gps in the deep learning era. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2621\u20132630 (2017)","DOI":"10.1109\/ICCV.2017.286"},{"issue":"10","key":"12_CR60","doi-asserted-by":"publisher","first-page":"3349","DOI":"10.1109\/TPAMI.2020.2983686","volume":"43","author":"J Wang","year":"2020","unstructured":"Wang, J., et al.: Deep high-resolution representation learning for visual recognition. IEEE Trans. Pattern Anal. Mach. Intell. 43(10), 3349\u20133364 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"12_CR61","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"133","DOI":"10.1007\/978-3-642-35728-2_13","volume-title":"Advances in Multimedia Modeling","author":"Y Wang","year":"2013","unstructured":"Wang, Y., Cao, L.: Discovering latent clusters from geotagged beach images. In: Li, S., et al. (eds.) MMM 2013. LNCS, vol. 7733, pp. 133\u2013142. Springer, Heidelberg (2013). https:\/\/doi.org\/10.1007\/978-3-642-35728-2_13"},{"key":"12_CR62","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: End-to-end video instance segmentation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8741\u20138750 (2021)","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"12_CR63","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1007\/978-3-319-46484-8_3","volume-title":"Computer Vision \u2013 ECCV 2016","author":"T Weyand","year":"2016","unstructured":"Weyand, T., Kostrikov, I., Philbin, J.: PlaNet - photo geolocation with convolutional neural networks. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 37\u201355. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_3"},{"key":"12_CR64","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: simple and efficient design for semantic segmentation with transformers. In: Advances in Neural Information Processing Systems, vol. 34 (2021)"},{"key":"12_CR65","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: contrastive pre-training for zero-shot video-text understanding. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 6787\u20136800 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"12_CR66","doi-asserted-by":"crossref","unstructured":"Yang, F., Yang, H., Fu, J., Lu, H., Guo, B.: Learning texture transformer network for image super-resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5791\u20135800 (2020)","DOI":"10.1109\/CVPR42600.2020.00583"},{"key":"12_CR67","first-page":"29009","volume":"34","author":"H Yang","year":"2021","unstructured":"Yang, H., Lu, X., Zhu, Y.: Cross-view geo-localization with layer-to-layer transformer. Adv. Neural. Inf. Process. Syst. 34, 29009\u201329020 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"12_CR68","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Unified contrastive learning in image-text-label space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19163\u201319173 (2022)","DOI":"10.1109\/CVPR52688.2022.01857"},{"key":"12_CR69","doi-asserted-by":"crossref","unstructured":"Yang, L., Fan, Y., Xu, N.: Video instance segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5188\u20135197 (2019)","DOI":"10.1109\/ICCV.2019.00529"},{"key":"12_CR70","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R.R., Le, Q.V.: XLNet: generalized autoregressive pretraining for language understanding. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"12_CR71","doi-asserted-by":"crossref","unstructured":"Ye, H.J., Hu, H., Zhan, D.C., Sha, F.: Few-shot learning via embedding adaptation with set-to-set functions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8808\u20138817 (2020)","DOI":"10.1109\/CVPR42600.2020.00883"},{"key":"12_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Yang, Q.: A survey on multi-task learning. IEEE Trans. Knowl. Data Eng. (2021)","DOI":"10.1109\/TKDE.2021.3070203"},{"key":"12_CR73","doi-asserted-by":"crossref","unstructured":"Zhao, H., Jiang, L., Jia, J., Torr, P.H., Koltun, V.: Point transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16259\u201316268 (2021)","DOI":"10.1109\/ICCV48922.2021.01595"},{"key":"12_CR74","doi-asserted-by":"crossref","unstructured":"Zheng, S., et al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6881\u20136890 (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"issue":"6","key":"12_CR75","doi-asserted-by":"publisher","first-page":"1452","DOI":"10.1109\/TPAMI.2017.2723009","volume":"40","author":"B Zhou","year":"2017","unstructured":"Zhou, B., Lapedriza, A., Khosla, A., Oliva, A., Torralba, A.: Places: a 10 million image database for scene recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40(6), 1452\u20131464 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"12_CR76","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ade20k dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 633\u2013641 (2017)","DOI":"10.1109\/CVPR.2017.544"},{"issue":"3","key":"12_CR77","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ade20k dataset. Int. J. Comput. Vision 127(3), 302\u2013321 (2019)","journal-title":"Int. J. Comput. Vision"},{"key":"12_CR78","doi-asserted-by":"crossref","unstructured":"Zhu, S., Shah, M., Chen, C.: Transgeo: transformer is all you need for cross-view image geo-localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1162\u20131171 (2022)","DOI":"10.1109\/CVPR52688.2022.00123"},{"key":"12_CR79","doi-asserted-by":"crossref","unstructured":"Zhu, S., Yang, T., Chen, C.: Vigor: cross-view image geo-localization beyond one-to-one retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3640\u20133649, June 2021","DOI":"10.1109\/CVPR46437.2021.00364"},{"key":"12_CR80","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: deformable transformers for end-to-end object detection. In: International Conference on Learning Representations (2020)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19839-7_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T12:21:47Z","timestamp":1709814107000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19839-7_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198380","9783031198397"],"references-count":80,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19839-7_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"23 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}