{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,29]],"date-time":"2025-03-29T16:54:07Z","timestamp":1743267247051,"version":"3.40.3"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031624940"},{"type":"electronic","value":"9783031624957"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-62495-7_13","type":"book-chapter","created":{"date-parts":[[2024,6,21]],"date-time":"2024-06-21T20:19:24Z","timestamp":1719001164000},"page":"166-177","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Exploiting LMM-Based Knowledge for\u00a0Image Classification Tasks"],"prefix":"10.1007","author":[{"given":"Maria","family":"Tzelepi","sequence":"first","affiliation":[]},{"given":"Vasileios","family":"Mezaris","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,6,22]]},"reference":[{"key":"13_CR1","unstructured":"Achiam, J., et al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"13_CR2","doi-asserted-by":"crossref","unstructured":"Ao, T., Zhang, Z., Liu, L.: GestureDiffuCLIP: gesture diffusion model with CLIP latents. arXiv preprint arXiv:2303.14613 (2023)","DOI":"10.1145\/3592097"},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Aubakirova, D., Gerdes, K., Liu, L.: PatFig: generating short and long captions for patent figures. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2843\u20132849 (2023)","DOI":"10.1109\/ICCVW60793.2023.00305"},{"key":"13_CR4","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol. 33, pp. 1877\u20131901 (2020)"},{"key":"13_CR5","doi-asserted-by":"crossref","unstructured":"Fu, D., et al.: Drive like a human: rethinking autonomous driving with large language models. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 910\u2013919 (2024)","DOI":"10.1109\/WACVW60836.2024.00102"},{"key":"13_CR6","doi-asserted-by":"publisher","first-page":"108797","DOI":"10.1109\/ACCESS.2022.3213652","volume":"10","author":"N Gkalelis","year":"2022","unstructured":"Gkalelis, N., Daskalakis, D., Mezaris, V.: ViGAT: bottom-up event recognition and explanation in video using factorized graph attention network. IEEE Access 10, 108797\u2013108816 (2022)","journal-title":"IEEE Access"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Jiao, W., Huang, J.T., Wang, W., Wang, X., Shi, S., Tu, Z.: ParroT: translating during chat using large language models. arXiv preprint arXiv:2304.02426 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.1001"},{"key":"13_CR8","unstructured":"Koh, P.W., et al.: Concept bottleneck models. In: International Conference on Machine Learning, pp. 5338\u20135348. PMLR (2020)"},{"key":"13_CR9","unstructured":"K\u00f6pf, A., et al.: Openassistant conversations-democratizing large language model alignment. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"13_CR10","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"13_CR11","doi-asserted-by":"crossref","unstructured":"Maniparambil, M., Vorster, C., Molloy, D., Murphy, N., McGuinness, K., O\u2019Connor, N.E.: Enhancing CLIP with GPT-4: harnessing visual descriptions as prompts. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 262\u2013271 (2023)","DOI":"10.1109\/ICCVW60793.2023.00034"},{"issue":"4","key":"13_CR12","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1109\/MGRS.2020.3005751","volume":"8","author":"L Mou","year":"2020","unstructured":"Mou, L., Hua, Y., Jin, P., Zhu, X.X.: ERA: a data set and deep learning benchmark for event recognition in aerial videos. IEEE Geosci. Remote Sens. Mag. 8(4), 125\u2013133 (2020)","journal-title":"IEEE Geosci. Remote Sens. Mag."},{"key":"13_CR13","unstructured":"Nam, J., Cha, H., Ahn, S., Lee, J., Shin, J.: Learning from failure: de-biasing classifier from biased classifier. In: Advances in Neural Information Processing Systems, vol. 33, pp. 20673\u201320684 (2020)"},{"key":"13_CR14","unstructured":"Naveed, H., et al.: A comprehensive overview of large language models. arXiv preprint arXiv:2307.06435 (2023)"},{"key":"13_CR15","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"13_CR16","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"13_CR17","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"13_CR18","doi-asserted-by":"crossref","unstructured":"Wu, H.H., Seetharaman, P., Kumar, K., Bello, J.P.: Wav2CLIP: learning robust audio representations from clip. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4563\u20134567. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"13_CR19","doi-asserted-by":"crossref","unstructured":"Yang, Y., Panagopoulou, A., Zhou, S., Jin, D., Callison-Burch, C., Yatskar, M.: Language in a bottle: language model guided concept bottlenecks for interpretable image classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19187\u201319197 (2023)","DOI":"10.1109\/CVPR52729.2023.01839"},{"key":"13_CR20","unstructured":"Yin, S., et al.: A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Yu, W., Liu, Y., Hua, W., Jiang, D., Ren, B., Bai, X.: Turning a CLIP model into a scene text detector. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6978\u20136988 (2023)","DOI":"10.1109\/CVPR52729.2023.00674"},{"key":"13_CR22","unstructured":"Yuan, Z., Xue, H., Wang, X., Liu, Y., Zhao, Z., Wang, K.: ArtGPT-4: artistic vision-language understanding with adapter-enhanced MiniGPT-4. arXiv preprint arXiv:2305.07490 (2023)"},{"key":"13_CR23","doi-asserted-by":"crossref","unstructured":"Zhang, D., et al.: MM-LLMs: recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.738"},{"key":"13_CR24","unstructured":"Zhao, W.X., et al.: A survey of large language models. arXiv preprint arXiv:2303.18223 (2023)"},{"key":"13_CR25","unstructured":"Zhou, J., et al.: SkinGPT-4: an interactive dermatology diagnostic system with visual large language model (2023)"},{"key":"13_CR26","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"13_CR27","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vis. 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vis."},{"key":"13_CR28","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Communications in Computer and Information Science","Engineering Applications of Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-62495-7_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T11:46:00Z","timestamp":1732275960000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-62495-7_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031624940","9783031624957"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-62495-7_13","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"22 June 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"EANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Engineering Applications of Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Corfu","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 June 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 June 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eann2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eannconf.org\/2024\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}