{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T18:38:49Z","timestamp":1773772729756,"version":"3.50.1"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726576","type":"print"},{"value":"9783031726583","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72658-3_9","type":"book-chapter","created":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T03:32:37Z","timestamp":1727839957000},"page":"146-162","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["X-Former: Unifying Contrastive and\u00a0Reconstruction Learning for\u00a0MLLMs"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8842-5990","authenticated-orcid":false,"given":"Swetha","family":"Sirnam","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7004-3570","authenticated-orcid":false,"given":"Jinyu","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0198-240X","authenticated-orcid":false,"given":"Tal","family":"Neiman","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5378-1697","authenticated-orcid":false,"given":"Mamshad Nayeem","family":"Rizve","sequence":"additional","affiliation":[]},{"given":"Son","family":"Tran","sequence":"additional","affiliation":[]},{"given":"Benjamin","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Trishul","family":"Chilimbi","sequence":"additional","affiliation":[]},{"given":"Mubarak","family":"Shah","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,2]]},"reference":[{"key":"9_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: Nocaps: novel object captioning at scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"9_CR2","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. Advances in Neural Information Processing Systems (2022)"},{"key":"9_CR3","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. Adv. Neural Inf. Processing Syst. (2020)"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12m: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"9_CR5","unstructured":"Chen, C., et al.: Why do we need large batchsizes in contrastive learning? A gradient-bias perspective. Adv. Neural Inf. Process. Syst. 
(2022)"},{"key":"9_CR6","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal llm\u2019s referential dialogue magic (2023)"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Dai, W., Hou, L., Shang, L., Jiang, X., Liu, Q., Fung, P.: Enabling multimodal generation on CLIP via vision-language knowledge distillation. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Findings of the Association for Computational Linguistics: ACL 2022 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.187"},{"key":"9_CR8","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Duan, J., et al.: Multi-modal alignment using representation codebook. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01520"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: Eva: Exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"9_CR11","unstructured":"Ge, Y., Ge, Y., Zeng, Z., Wang, X., Shan, Y.: Planting a seed of vision in large language model (2023)"},{"key":"9_CR12","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VGA matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"9_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"9_CR14","doi-asserted-by":"crossref","unstructured":"Hu, Z., Zhu, X., Tran, S., Vidal, R., Dhua, A.: Provla: compositional image search with progressive vision-language alignment and multimodal fusion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCVW60793.2023.00293"},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GGA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"9_CR16","unstructured":"Jaegle, A., et\u00a0al.: Perceiver IO: a general architecture for structured inputs & outputs. arXiv preprint arXiv:2107.14795 (2021)"},{"key":"9_CR17","unstructured":"Jaegle, A., Gimeno, F., Brock, A., Vinyals, O., Zisserman, A., Carreira, J.: Perceiver: general perception with iterative attention. In: International Conference on Machine Learning. PMLR (2021)"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Jiang, Q., et al.: Understanding and constructing latent modality structures in multi-modal representation learning. 
{"key":"9_CR19","doi-asserted-by":"crossref","unstructured":"Jin, W., Cheng, Y., Shen, Y., Chen, W., Ren, X.: A good prompt is worth millions of parameters: low-resource prompt-based learning for vision-language models. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (2022)","DOI":"10.18653\/v1\/2022.acl-long.197"},{"key":"9_CR20","unstructured":"Koh, J.Y., Salakhutdinov, R., Fried, D.: Grounding language models to images for multimodal inputs and outputs. International Conference on Machine Learning (2023)"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Krishna, R., et\u00a0al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123(1), 32\u201373 (2017)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"9_CR22","unstructured":"Li, C.: Large multimodal models: notes on CVPR 2023 tutorial. arXiv preprint arXiv:2306.14895 (2023)"},{"key":"9_CR23","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML (2023)"},{"key":"9_CR24","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning. PMLR (2022)"},{"key":"9_CR25","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: Proceedings of the Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, 6\u201312 September 2014, Part V. Springer (2014)"},{"key":"9_CR26","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023)"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: OK-VQA: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Mukhoti, J., et al.: Open vocabulary semantic segmentation with patch aligned contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01860"},{"key":"9_CR29","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.: Im2Text: describing images using 1 million captioned photographs. Adv. Neural Inf. Process. Syst. (2011)"},{"key":"9_CR30","unstructured":"Park, N., Kim, W., Heo, B., Kim, T., Yun, S.: What do self-supervised vision transformers learn? arXiv preprint arXiv:2305.00729 (2023)"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"9_CR32","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning. PMLR (2021)"},
PMLR (2021)"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? exploring the visual shortcomings of multimodal LLMS (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"9_CR35","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S.M.A., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. In: Ranzato, M., Beygelzimer, A., Dauphin, Y., Liang, P., Vaughan, J.W. (eds.) Advances in Neural Information Processing Systems (2021)"},{"key":"9_CR36","unstructured":"Wang, G., Ge, Y., Ding, X., Kankanhalli, M., Shan, Y.: What makes for good visual tokenizers for large language models? (2023)"},{"key":"9_CR37","doi-asserted-by":"crossref","unstructured":"Weers, F., Shankar, V., Katharopoulos, A., Yang, Y., Gunter, T.: Masked autoencoding does not help natural language supervision at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.02244"},{"key":"9_CR38","unstructured":"Wei, J., et\u00a0al.: Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Vision-language pre-training with triple contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"9_CR40","unstructured":"Yin, S., et al.: A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)"},{"key":"9_CR41","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: visual commonsense reasoning. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"9_CR42","unstructured":"Zhang, S., et\u00a0al.: Opt: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"9_CR43","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. 
"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72658-3_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T03:36:31Z","timestamp":1727840191000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72658-3_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,2]]},"ISBN":["9783031726576","9783031726583"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72658-3_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,2]]},"assertion":[{"value":"2 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}