{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T13:27:44Z","timestamp":1773840464722,"version":"3.50.1"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031731945","type":"print"},{"value":"9783031731952","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73195-2_19","type":"book-chapter","created":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T09:35:43Z","timestamp":1732613743000},"page":"324-341","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Understanding Multi-compositional Learning in\u00a0Vision and\u00a0Language Models via\u00a0Category Theory"],"prefix":"10.1007","author":[{"given":"Sotirios Panagiotis","family":"Chytas","sequence":"first","affiliation":[]},{"given":"Hyunwoo J.","family":"Kim","sequence":"additional","affiliation":[]},{"given":"Vikas","family":"Singh","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,27]]},"reference":[{"key":"19_CR1","unstructured":"Introducing stable lm zephyr 3b: a new addition to stable lm, bringing powerful llm assistants to edge devices. https:\/\/stability.ai\/news\/stablelm-zephyr-3b-stability-llm"},{"key":"19_CR2","unstructured":"Llm - detect ai generated text. https:\/\/www.kaggle.com\/competitions\/llm-detect-ai-generated-text"},{"key":"19_CR3","unstructured":"Llm hallucination index. https:\/\/github.com\/rungalileo\/hallucination-index"},{"key":"19_CR4","unstructured":"Phi-2: The surprising power of small language models. https:\/\/www.microsoft.com\/en-us\/research\/blog\/phi-2-the-surprising-power-of-small-language-models\/"},{"key":"19_CR5","unstructured":"Zig et puce. https:\/\/www.coolfrenchcomics.com\/zigpuce.htm"},{"key":"19_CR6","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: bert pre-training of image transformers. In: International Conference on Learning Representations (2022). https:\/\/www.microsoft.com\/en-us\/research\/publication\/beit-bert-pre-training-of-image-transformers\/"},{"key":"19_CR7","unstructured":"BehnamGhader, P., Adlakha, V., Mosbach, M., Bahdanau, D., Chapados, N., Reddy, S.: Llm2vec: large language models are secretly powerful text encoders. arXiv preprint arXiv:2404.05961 (2024)"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Bradley, T.D., Terilla, J., Vlassopoulos, Y.: An enriched category theory of language: from syntax to semantics. La Matematica 1(2) (2022)","DOI":"10.1007\/s44007-022-00021-2"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Chen, C.Y., Grauman, K.: Inferring analogous attributes. In: IEEE Conference on Computer Vision and Pattern Recognition (2014)","DOI":"10.1109\/CVPR.2014.33"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, Z.Q., Wu, X., Huang, S., Li, J.X., Hauptmann, A., Peng, Q.: Learning to transfer: generalizable attribute learning with multitask neural model search. In: Proceedings of the 26th ACM International Conference on Multimedia (2018)","DOI":"10.1145\/3240508.3240518"},{"key":"19_CR11","unstructured":"Chytas, S.P., Lokhande, V.S., Singh, V.: Pooling image datasets with multiple covariate shift and imbalance. In: International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=2Mo7v69otj"},{"key":"19_CR12","unstructured":"Cruttwell, G.S.H., Gavranovi\u0107, B., Ghani, N., Wilson, P., Zanasi, F.: Categorical foundations of gradient-based learning (2021). https:\/\/arxiv.org\/abs\/2103.01931"},{"key":"19_CR13","unstructured":"Cui, Y., Niekum, S., Gupta, A., Kumar, V., Rajeswaran, A.: Can foundation models perform zero-shot task specification for robot manipulation? In: Proceedings of The 4th Annual Learning for Dynamics and Control Conference, vol.\u00a0168. PMLR (2022). https:\/\/proceedings.mlr.press\/v168\/cui22a.html"},{"key":"19_CR14","doi-asserted-by":"publisher","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: IEEE Conference on Computer Vision and Pattern Recognition (2009). https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"19_CR15","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, vol. 1 (Long and Short Papers). Association for Computational Linguistics (2019). https:\/\/aclanthology.org\/N19-1423"},{"key":"19_CR16","unstructured":"Drozdov, A., et al.: Compositional semantic parsing with large language models. arXiv preprint arXiv:2209.15003 (2022)"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Everingham, M., Gool, L.V., Williams, C.K.I., Winn, J.M., Zisserman, A.: The pascal visual object classes (voc) challenge. Int. J. Comput. Vision (2010)","DOI":"10.1007\/s11263-009-0275-4"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Endres, I., Hoiem, D., Forsyth, D.A.: Describing objects by their attributes. In: IEEE Conference on Computer Vision and Pattern Recognition (2009)","DOI":"10.1109\/CVPRW.2009.5206772"},{"key":"19_CR19","doi-asserted-by":"publisher","unstructured":"Fong, B., Spivak, D., Tuy\u00e9ras, R.: Backprop as functor: a compositional perspective on supervised learning. In: 2019 34th Annual ACM\/IEEE Symposium on Logic in Computer Science (LICS) (2019). https:\/\/doi.org\/10.1109\/LICS.2019.8785665","DOI":"10.1109\/LICS.2019.8785665"},{"key":"19_CR20","doi-asserted-by":"crossref","unstructured":"Fong, B., Spivak, D.I.: Seven sketches in compositionality: an invitation to applied category theory (2018). https:\/\/arxiv.org\/abs\/1803.05316","DOI":"10.1017\/9781108668804"},{"key":"19_CR21","unstructured":"Freedman, D., Pisani, R., Purves, R.: Statistics (international student edition). Pisani, R. Purves, 4th edn. WW Norton & Company, New York (2007)"},{"key":"19_CR22","unstructured":"Furrer, D.P., van Zee, M., Scales, N., Sch\u00e4rli, N.: Compositional generalization in semantic parsing: pre-training vs. specialized architectures. arXiv e-prints arXiv:2007.08970 (2020)"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Gavranovi\u0107 , B.: Learning functors using gradient descent. In: Electronic Proceedings in Theoretical Computer Science (2020). https:\/\/doi.org\/10.4204%2Feptcs.323.15","DOI":"10.4204\/EPTCS.323.15"},{"key":"19_CR24","unstructured":"Gavranovi\u0107, B.: Compositional deep learning (2019). https:\/\/arxiv.org\/abs\/1907.08292"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Hand, E.M., Chellappa, R.: Attributes for improved attributes: a multi-task network utilizing implicit and explicit relationships for facial attribute classification. In: AAAI Conference on Artificial Intelligence (2017)","DOI":"10.1609\/aaai.v31i1.11229"},{"key":"19_CR26","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: IEEE Conference on Computer Vision and Pattern Recognition (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"19_CR27","unstructured":"He, P., Gao, J., Chen, W.: Debertav3: improving deberta using electra-style pre-training with gradient-disentangled embedding sharing. arXiv preprint arXiv:2111.09543 (2021)"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: Gqa: a new dataset for real-world visual reasoning and compositional question answering. In: IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"19_CR29","doi-asserted-by":"crossref","unstructured":"Isola, P., Lim, J.J., Adelson, E.H.: Discovering states and transformations in image collections. In: IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298744"},{"key":"19_CR30","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7b. arXiv preprint arXiv:2310.06825 (2023)"},{"key":"19_CR31","unstructured":"Keysers, D., et al.: Measuring compositional generalization: a comprehensive method on realistic data. In: International Conference on Learning Representations (2020)"},{"key":"19_CR32","unstructured":"Kojima, T., Gu, S.S., Reid, M., Matsuo, Y., Iwasawa, Y.: Large language models are zero-shot reasoners (2022). https:\/\/arxiv.org\/abs\/2205.11916"},{"key":"19_CR33","unstructured":"Krizhevsky, A.: Learning multiple layers of features from tiny images. University of Toronto (2012)"},{"key":"19_CR34","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: Albert: a lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Lawvere, F.W., Schanuel, S.H.: Conceptual Mathematics: A First Introduction to Categories. Cambridge University Press, Cambridge (2009)","DOI":"10.1017\/CBO9780511804199"},{"key":"19_CR36","unstructured":"Lee, C., et al.: Nv-embed: improved techniques for training llms as generalist embedding models. arXiv preprint arXiv:2405.17428 (2024)"},{"key":"19_CR37","doi-asserted-by":"crossref","unstructured":"Li, J., Xu, Y., Lv, T., Cui, L., Zhang, C., Wei, F.: Dit: self-supervised pre-training for document image transformer. In: ACM Multimedia 2022 (2022). https:\/\/www.microsoft.com\/en-us\/research\/publication\/dit-self-supervised-pre-training-for-document-image-transformer\/","DOI":"10.1145\/3503161.3547911"},{"issue":"12","key":"19_CR38","doi-asserted-by":"publisher","first-page":"9043","DOI":"10.1109\/TPAMI.2021.3119406","volume":"44","author":"YL Li","year":"2022","unstructured":"Li, Y.L., Xu, Y., Xu, X., Mao, X., Lu, C.: Learning single\/multi-attribute of object with symmetry and group. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 9043\u20139055 (2022). https:\/\/doi.org\/10.1109\/TPAMI.2021.3119406","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR39","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"19_CR40","doi-asserted-by":"publisher","unstructured":"Lu, Y., Kumar, A., Zhai, S., Cheng, Y., Javidi, T., Feris, R.: Fully-adaptive feature sharing in multi-task networks with applications in person attribute classification. In: IEEE Conference on Computer Vision and Pattern Recognition (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.126","DOI":"10.1109\/CVPR.2017.126"},{"key":"19_CR41","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4757-4721-8","volume-title":"Categories for the Working Mathematician","author":"S MacLane","year":"2014","unstructured":"MacLane, S.: Categories for the Working Mathematician. Springer, New York (2014). https:\/\/doi.org\/10.1007\/978-1-4757-4721-8"},{"key":"19_CR42","doi-asserted-by":"publisher","unstructured":"Mancini, M., Naeem, M.F., Xian, Y., Akata, Z.: Open world compositional zero-shot learning. In: IEEE Conference on Computer Vision and Pattern Recognition (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.00518","DOI":"10.1109\/CVPR46437.2021.00518"},{"key":"19_CR43","unstructured":"Marquis, J.P.: Category Theory. In: Zalta, E.N. (ed.) The Stanford Encyclopedia of Philosophy. Metaphysics Research Lab, Stanford University, Fall 2021 edn. (2021)"},{"key":"19_CR44","doi-asserted-by":"crossref","unstructured":"Misra, I., Gupta, A.K., Hebert, M.: From red wine to red tomato: composition with context. In: IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.129"},{"key":"19_CR45","unstructured":"Moschella, L., Maiorca, V., Fumero, M., Norelli, A., Locatello, F., Rodol\u00e0, E.: Relative representations enable zero-shot latent space communication. In: International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=SrC-nwieGJ"},{"key":"19_CR46","doi-asserted-by":"crossref","unstructured":"Naeem, M.F., Xian, Y., Tombari, F., Akata, Z.: Learning graph embeddings for compositional zero-shot learning. In: IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00101"},{"key":"19_CR47","unstructured":"Nayak, N.V., Yu, P., Bach, S.: Learning to compose soft prompts for compositional zero-shot learning. In: International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=S8-A2FXnIh"},{"key":"19_CR48","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: Conference on Empirical Methods in Natural Language Processing (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"19_CR49","unstructured":"Plato: The Republic (1994). http:\/\/classics.mit.edu\/Plato\/republic.html"},{"key":"19_CR50","doi-asserted-by":"crossref","unstructured":"Purushwalkam, S., Nickel, M., Gupta, A.K., Ranzato, M.: Task-driven modular networks for zero-shot compositional learning. In: IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00369"},{"key":"19_CR51","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"19_CR52","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners (2019). https:\/\/api.semanticscholar.org\/CorpusID:160025533"},{"key":"19_CR53","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"19_CR54","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"19_CR55","doi-asserted-by":"publisher","unstructured":"Saini, N., Pham, K., Shrivastava, A.: Disentangling visual embeddings for attributes and objects. In: IEEE Conference on Computer Vision and Pattern Recognition (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01329","DOI":"10.1109\/CVPR52688.2022.01329"},{"key":"19_CR56","unstructured":"Shiebler, D., Gavranovi\u0107, B., Wilson, P.: Category theory in machine learning (2021). https:\/\/arxiv.org\/abs\/2106.07032"},{"key":"19_CR57","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol.\u00a030. Curran Associates, Inc. (2017). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"19_CR58","doi-asserted-by":"publisher","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., Torralba, A.: Sun database: large-scale scene recognition from abbey to zoo. In: IEEE Conference on Computer Vision and Pattern Recognition (2010). https:\/\/doi.org\/10.1109\/CVPR.2010.5539970","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"19_CR59","doi-asserted-by":"crossref","unstructured":"Ye-Bin, M., Kim, J., Kim, H., Son, K., Oh, T.H.: Textmania: enriching visual feature by text-driven manifold augmentation. In: IEEE International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.00239"},{"key":"19_CR60","doi-asserted-by":"publisher","unstructured":"Yu, A., Grauman, K.: Fine-grained visual comparisons with local learning. In: IEEE Conference on Computer Vision and Pattern Recognition (2014). https:\/\/doi.org\/10.1109\/CVPR.2014.32","DOI":"10.1109\/CVPR.2014.32"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73195-2_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T10:12:05Z","timestamp":1732615925000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73195-2_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,27]]},"ISBN":["9783031731945","9783031731952"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73195-2_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,27]]},"assertion":[{"value":"27 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}