{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:11:12Z","timestamp":1777655472761,"version":"3.51.4"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733468","type":"print"},{"value":"9783031733475","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73347-5_18","type":"book-chapter","created":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:15:43Z","timestamp":1730106943000},"page":"309-327","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Omniview-Tuning: Boosting Viewpoint Invariance of\u00a0Vision-Language Pre-training Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0481-5855","authenticated-orcid":false,"given":"Shouwei","family":"Ruan","sequence":"first","affiliation":[]},{"given":"Yinpeng","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Hanqing","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yao","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Hang","family":"Su","sequence":"additional","affiliation":[]},{"given":"Xingxing","family":"Wei","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,29]]},"reference":[{"key":"18_CR1","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems, pp. 23716\u201323736 (2022)"},{"key":"18_CR2","doi-asserted-by":"crossref","unstructured":"Alcorn, M.A., et al.: Strike (with) a pose: neural networks are easily fooled by strange poses of familiar objects. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4845\u20134854 (2019)","DOI":"10.1109\/CVPR.2019.00498"},{"key":"18_CR3","unstructured":"Awadalla, A., et\u00a0al.: OpenFlamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"18_CR4","unstructured":"Barbu, A., et al.: ObjectNet: a large-scale bias-controlled dataset for pushing the limits of object recognition models. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"issue":"2","key":"18_CR5","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1037\/0033-295X.94.2.115","volume":"94","author":"I Biederman","year":"1987","unstructured":"Biederman, I.: Recognition-by-components: a theory of human image understanding. Psychol. Rev. 94(2), 115 (1987)","journal-title":"Psychol. Rev."},{"key":"18_CR6","unstructured":"Calian, D.A., et al.: Defending against image corruptions through adversarial augmentations. arXiv preprint arXiv:2104.01086 (2021)"},{"key":"18_CR7","doi-asserted-by":"crossref","unstructured":"Cha, J., Lee, K., Park, S., Chun, S.: Domain generalization by mutual-information regularization with pre-trained models. In: European Conference on Computer Vision (ECCV) (2022)","DOI":"10.1007\/978-3-031-20050-2_26"},{"key":"18_CR8","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX","author":"YC Chen","year":"2020","unstructured":"Chen, Y.C., et al.: UNITER: universal Image-text Representation Learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX, pp. 104\u2013120. Springer International Publishing, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"Chuang, Y.S., et al.: DiffCSE: difference-based contrastive learning for sentence embeddings. arXiv preprint arXiv:2204.10298 (2022)","DOI":"10.18653\/v1\/2022.naacl-main.311"},{"key":"18_CR10","doi-asserted-by":"crossref","unstructured":"Collins, J., et\u00a0al.: ABO: dataset and benchmarks for real-world 3D object understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21126\u201321136 (2022)","DOI":"10.1109\/CVPR52688.2022.02045"},{"key":"18_CR11","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"18_CR12","doi-asserted-by":"crossref","unstructured":"Das, S., Ryoo, M.S.: ViewCLR: learning self-supervised video representation for unseen viewpoints. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5573\u20135583 (2023)","DOI":"10.1109\/WACV56688.2023.00553"},{"key":"18_CR13","doi-asserted-by":"crossref","unstructured":"Deitke, M., et al.: Objaverse: a universe of annotated 3D objects. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13142\u201313153 (2023)","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"18_CR14","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"18_CR15","doi-asserted-by":"crossref","unstructured":"Dong, Y., et al.: Benchmarking robustness of 3D object detection to common corruptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1022\u20131032 (2023)","DOI":"10.1109\/CVPR52729.2023.00105"},{"key":"18_CR16","first-page":"36789","volume":"35","author":"Y Dong","year":"2022","unstructured":"Dong, Y., Ruan, S., Su, H., Kang, C., Wei, X., Zhu, J.: ViewFool: evaluating the robustness of visual recognition to adversarial viewpoints. Adv. Neural. Inf. Process. Syst. 35, 36789\u201336803 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR17","unstructured":"Fang, A., et al.: Data determines distributional robustness in contrastive language image pre-training (CLIP). In: Proceedings of the 39th International Conference on Machine Learning, pp. 6216\u20136234 (2022)"},{"key":"18_CR18","unstructured":"Gadre, S.Y., et\u00a0al.: DataComp: in search of the next generation of multimodal datasets. arXiv preprint arXiv:2304.14108 (2023)"},{"key":"18_CR19","doi-asserted-by":"crossref","unstructured":"Gao, P., ET AL.: Clip-adapter: Better vision-language models with feature adapters. Int. J. Comput. Vis. 132(2), 1\u201315 (2023)","DOI":"10.1007\/s11263-023-01891-x"},{"key":"18_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"22","DOI":"10.1007\/978-3-030-66415-2_2","volume-title":"Computer Vision \u2013 ECCV 2020 Workshops","author":"A Hamdi","year":"2020","unstructured":"Hamdi, A., Ghanem, B.: Towards analyzing semantic robustness of deep neural networks. In: Bartoli, A., Fusiello, A. (eds.) ECCV 2020. LNCS, vol. 12535, pp. 22\u201338. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-66415-2_2"},{"key":"18_CR21","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., et\u00a0al.: The many faces of robustness: a critical analysis of out-of-distribution generalization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8340\u20138349 (2021)","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"18_CR22","unstructured":"Hendrycks, D., Dietterich, T.: Benchmarking neural network robustness to common corruptions and perturbations. arXiv preprint arXiv:1903.12261 (2019)"},{"key":"18_CR23","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., Song, D.: Natural adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15262\u201315271 (2021)","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"18_CR24","doi-asserted-by":"crossref","unstructured":"Ho, C.H., Leung, B., Sandstrom, E., Chang, Y., Vasconcelos, N.: Catastrophic child\u2019s play: easy to perform, hard to defend adversarial attacks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9229\u20139237 (2019)","DOI":"10.1109\/CVPR.2019.00945"},{"key":"18_CR25","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"18_CR26","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: OpenCLIP (2021). https:\/\/doi.org\/10.5281\/zenodo.5143773, https:\/\/doi.org\/10.5281\/zenodo.5143773","DOI":"10.5281\/zenodo.5143773"},{"key":"18_CR27","unstructured":"Krizhevsky, A., Hinton, G., et\u00a0al.: Learning multiple layers of features from tiny images (2009)"},{"key":"18_CR28","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"18_CR29","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"18_CR30","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR31","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"18_CR32","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"18_CR33","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems (2023)"},{"key":"18_CR34","unstructured":"Liu, M., et al.: OpenShape: scaling up 3D shape representation towards open-world understanding. Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"18_CR35","unstructured":"Madan, S., et al.: When and how CNNs generalize to out-of-distribution category-viewpoint combinations. arXiv preprint arXiv:2007.08032 (2020)"},{"key":"18_CR36","unstructured":"Madry, A., Makelov, A., Schmidt, L., Tsipras, D., Vladu, A.: Towards deep learning models resistant to adversarial attacks. In: International Conference on Learning Representations (ICLR) (2018)"},{"key":"18_CR37","unstructured":"Mao, C., Geng, S., Yang, J., Wang, X., Vondrick, C.: Understanding zero-shot adversarial robustness for large-scale models. arXiv preprint arXiv:2212.07016 (2022)"},{"issue":"1","key":"18_CR38","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"},{"issue":"4","key":"18_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3528223.3530127","volume":"41","author":"T M\u00fcller","year":"2022","unstructured":"M\u00fcller, T., Evans, A., Schied, C., Keller, A.: Instant neural graphics primitives with a multiresolution hash encoding. ACM Trans. Graph. (ToG) 41(4), 1\u201315 (2022)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"18_CR40","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"18_CR41","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: Proceedings of the 38th International Conference on Machine Learning, pp. 8821\u20138831 (2021)"},{"key":"18_CR42","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V.: Do ImageNet classifiers generalize to ImageNet? In: International Conference on Machine Learning, pp. 5389\u20135400. PMLR (2019)"},{"key":"18_CR43","doi-asserted-by":"crossref","unstructured":"Reizenstein, J., Shapovalov, R., Henzler, P., Sbordone, L., Labatut, P., Novotny, D.: Common Objects in 3D: large-scale learning and evaluation of real-life 3D category reconstruction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10901\u201310911 (2021)","DOI":"10.1109\/ICCV48922.2021.01072"},{"key":"18_CR44","unstructured":"Rong, X.: word2vec parameter learning explained. arXiv preprint arXiv:1411.2738 (2014)"},{"key":"18_CR45","doi-asserted-by":"crossref","unstructured":"Ruan, S., Dong, Y., Su, H., Peng, J., Chen, N., Wei, X.: Improving viewpoint robustness for visual recognition via adversarial training. arXiv preprint arXiv:2307.11528 (2023)","DOI":"10.1109\/ICCV51070.2023.00434"},{"key":"18_CR46","doi-asserted-by":"crossref","unstructured":"Ruan, S., Dong, Y., Su, H., Peng, J., Chen, N., Wei, X.: Towards viewpoint-invariant visual recognition via adversarial training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4709\u20134719 (2023)","DOI":"10.1109\/ICCV51070.2023.00434"},{"key":"18_CR47","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems (2022)"},{"key":"18_CR48","unstructured":"Schlarmann, C., Singh, N.D., Croce, F., Hein, M.: Robust CLIP: unsupervised adversarial fine-tuning of vision embeddings for robust large vision-language models. arXiv preprint arXiv:2402.12336 (2024)"},{"key":"18_CR49","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR50","first-page":"16857","volume":"33","author":"K Song","year":"2020","unstructured":"Song, K., Tan, X., Qin, T., Lu, J., Liu, T.Y.: MPNet: masked and permuted pre-training for language understanding. Adv. Neural. Inf. Process. Syst. 33, 16857\u201316867 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR51","unstructured":"Sun, Q., Fang, Y., Wu, L., Wang, X., Cao, Y.: EVA-CLIP: improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)"},{"key":"18_CR52","unstructured":"Sun, Q., et al.: EVA-CLIP-18B: scaling clip to 18 billion parameters. arXiv preprint arXiv:2402.04252 (2024)"},{"issue":"2","key":"18_CR53","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., et al.: YFCC100M: the new data in multimedia research. Commun. ACM 59(2), 64\u201373 (2016)","journal-title":"Commun. ACM"},{"key":"18_CR54","unstructured":"Tu, W., Deng, W., Gedeon, T.: A closer look at the robustness of contrastive language-image pre-training (CLIP). In: Thirty-Seventh Conference on Neural Information Processing Systems (2023)"},{"key":"18_CR55","unstructured":"Wang, H., Ge, S., Lipton, Z., Xing, E.P.: Learning robust global representations by penalizing local predictive power. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"18_CR56","doi-asserted-by":"crossref","unstructured":"Wang, X., He, K., Gupta, A.: Transitive invariance for self-supervised visual representation learning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1329\u20131338 (2017)","DOI":"10.1109\/ICCV.2017.149"},{"key":"18_CR57","unstructured":"Wu, Z., Wang, Z., Xu, X., Lu, J., Yan, H.: Embodied task planning with large language models. arXiv preprint arXiv:2307.01848 (2023)"},{"key":"18_CR58","unstructured":"Xu, H., et al.: Demystifying clip data. arXiv preprint arXiv:2309.16671 (2023)"},{"key":"18_CR59","doi-asserted-by":"crossref","unstructured":"Yu, X., et\u00a0al.: MVImgNet: a large-scale dataset of multi-view images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9150\u20139161 (2023)","DOI":"10.1109\/CVPR52729.2023.00883"},{"key":"18_CR60","unstructured":"Zhang, Y., et\u00a0al.: Benchmarking trustworthiness of multimodal large language models: a comprehensive study. arXiv preprint arXiv:2406.07057 (2024)"},{"key":"18_CR61","unstructured":"Zhao, B., et al.: OOD-CV: a benchmark for robustness to individual nuisances in real-world out-of-distribution shifts. In: ICML 2022 Shift Happens Workshop (2022)"},{"key":"18_CR62","doi-asserted-by":"crossref","unstructured":"Zhou, X., Liu, M., Zagar, B.L., Yurtsever, E., Knoll, A.C.: Vision language models in autonomous driving and intelligent transportation systems. arXiv preprint arXiv:2310.14414 (2023)","DOI":"10.1109\/TIV.2024.3402136"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73347-5_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:48:37Z","timestamp":1730108917000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73347-5_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"ISBN":["9783031733468","9783031733475"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73347-5_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"29 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}