{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T09:33:45Z","timestamp":1764927225920,"version":"3.46.0"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T00:00:00Z","timestamp":1764892800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T00:00:00Z","timestamp":1764892800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["No. 61472348","61672455"],"award-info":[{"award-number":["No. 61472348","61672455"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"the Open Project Program of the State Key Laboratory of CAD&CG","award":["No.  A2329"],"award-info":[{"award-number":["No.  A2329"]}]},{"name":"the General Scientific Research Project of Zhejiang Provincial Department of Education","award":["No.Y202352364"],"award-info":[{"award-number":["No.Y202352364"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Intell Syst"],"DOI":"10.1007\/s44196-025-01061-6","type":"journal-article","created":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T09:31:21Z","timestamp":1764927081000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Language-Image Pre-training Model Based on Context Optimization and Region of Interest"],"prefix":"10.1007","volume":"18","author":[{"given":"Ran","family":"Jin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tao","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengang","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianzi","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ya","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qinghui","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Min","family":"Luo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,12,5]]},"reference":[{"key":"1061_CR1","unstructured":"Radford, A., Kim, J. W., Hallacy, C., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. (2021)"},{"key":"1061_CR2","unstructured":"Ramesh, A., Pavlov, M., Goh, G., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. (2021)"},{"key":"1061_CR3","unstructured":"Jia, C., Yang, Y., Xia, Y., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. (2021)"},{"key":"1061_CR4","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., et al.: Align before fuse: Vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1061_CR5","unstructured":"Zeng, Z., Mao, W.: A comprehensive empirical study of vision-language pre-trained model for supervised cross-modal retrieval. arXiv preprint arXiv:2201.02772 (2022)"},{"issue":"2","key":"1061_CR6","doi-asserted-by":"publisher","first-page":"2194","DOI":"10.1109\/TNNLS.2022.3188569","volume":"35","author":"S Peng","year":"2022","unstructured":"Peng, S., He, Y., Liu, X., et al.: Relation-aggregated cross-graph correlation learning for fine-grained image\u2013text retrieval. IEEE Trans Neural Netw Learn Syst 35(2), 2194\u20132207 (2022)","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"1061_CR7","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25363","author":"S Wang","year":"2023","unstructured":"Wang, S., Chang, J., Wang, Z., et al.: Fine-grained retrieval prompt tuning. Proc. AAAI Conf. Artif. Intell. (2023). https:\/\/doi.org\/10.1609\/aaai.v37i2.25363","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"issue":"12","key":"1061_CR8","doi-asserted-by":"publisher","first-page":"11588","DOI":"10.1109\/TVT.2018.2890405","volume":"68","author":"F Kou","year":"2019","unstructured":"Kou, F., Du, J., Cui, W., et al.: Common semantic representation method based on object attention and adversarial learning for cross-modal data in IoV. IEEE Trans. Veh. Technol. 68(12), 11588\u201311598 (2019)","journal-title":"IEEE Trans. Veh. Technol."},{"issue":"8","key":"1061_CR9","doi-asserted-by":"publisher","first-page":"4393","DOI":"10.1002\/int.22723","volume":"37","author":"L Shi","year":"2022","unstructured":"Shi, L., Du, J., Cheng, G., et al.: Cross-media search method based on complementary attention and generative adversarial network for social networks. Int. J. Intell. Syst. 37(8), 4393\u20134416 (2022)","journal-title":"Int. J. Intell. Syst."},{"key":"1061_CR10","doi-asserted-by":"publisher","first-page":"2851","DOI":"10.1109\/TMM.2022.3152086","volume":"25","author":"Y Liu","year":"2022","unstructured":"Liu, Y., Wu, J., Qu, L., et al.: Self-supervised correlation learning for cross-modal retrieval. IEEE Trans Multimed 25, 2851\u20132863 (2022)","journal-title":"IEEE Trans Multimed"},{"key":"1061_CR11","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3245400","author":"K Luo","year":"2023","unstructured":"Luo, K., Zhang, C., Li, H., et al.: Adaptive marginalized semantic hashing for unpaired cross-modal retrieval. IEEE Trans. Multimed. (2023). https:\/\/doi.org\/10.1109\/TMM.2023.3245400","journal-title":"IEEE Trans. Multimed."},{"key":"1061_CR12","doi-asserted-by":"crossref","unstructured":"Yang, D., Wu, D., Zhang, W., et al.: Deep semantic-alignment hashing for unsupervised cross-modal retrieval. Proceedings of the 2020 international conference on multimedia retrieval, pp. 44\u201352. (2020)","DOI":"10.1145\/3372278.3390673"},{"key":"1061_CR13","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i5.16592","author":"J Yu","year":"2021","unstructured":"Yu, J., Zhou, H., Zhan, Y., et al.: Deep graph-neighbor coherence preserving network for unsupervised cross-modal hashing. Proc. AAAI Conf. Artif. Intell. (2021). https:\/\/doi.org\/10.1609\/aaai.v35i5.16592","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"1061_CR14","doi-asserted-by":"publisher","first-page":"1671","DOI":"10.1109\/TIP.2022.3145159","volume":"31","author":"Z Chen","year":"2022","unstructured":"Chen, Z., Luo, X., Wang, Y., et al.: Fine-grained hashing with double filtering. IEEE Trans. Image Process. 31, 1671\u20131683 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"1061_CR15","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., et al.: Show and tell: A neural image caption generator. Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3156\u20133164. (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1061_CR16","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., et al.: Vqa: Visual question answering. Proceedings of the IEEE international conference on computer vision, pp. 2425\u20132433 (2015).","DOI":"10.1109\/ICCV.2015.279"},{"key":"1061_CR17","doi-asserted-by":"crossref","unstructured":"Hudson, D. A., Manning, C. D.: Gqa: A new dataset for real-world visual reasoning and compositional question answering. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6700\u20136709. (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"1061_CR18","doi-asserted-by":"crossref","unstructured":"Suhr, A., Zhou, S., Zhang, A., et al.: A corpus for reasoning about natural language grounded in photographs. arXiv preprint arXiv:1811.00491 (2018)","DOI":"10.18653\/v1\/P19-1644"},{"key":"1061_CR19","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., et al.: From recognition to cognition: Visual commonsense reasoning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6720\u20136731 (2019).","DOI":"10.1109\/CVPR.2019.00688"},{"key":"1061_CR20","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., et al.: Ok-vqa: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/cvf conference on computer vision and pattern recognition, 3195\u20133204. (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"1061_CR21","doi-asserted-by":"crossref","unstructured":"Singh, A., Natarajan, V., Shah, M., et al. Towards vqa models that can read. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 8317\u20138326. (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"1061_CR22","unstructured":"Wang, Z., Yu, J., Yu, A. W., et al.: Simvlm: Simple visual language model pre-training with weak supervision. arXiv preprint arXiv:2108.10904 (2021)"},{"key":"1061_CR23","unstructured":"Yuan, L., Chen, D., Chen, Y., et al.: Florence: a new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"1061_CR24","first-page":"23716","volume":"35","author":"J Alayrac","year":"2022","unstructured":"Alayrac, J., Donahue, J., Luc, P., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1061_CR25","unstructured":"Yu, J., Wang, Z., Vasudevan, V., et al.: Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"1061_CR26","unstructured":"Wang, J., Yang, Z., Hu, X., et al.: Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"1061_CR27","doi-asserted-by":"crossref","unstructured":"Xu, H., Yan, M., Li, C., et al.: E2E-VLP: End-to-end vision-language pre-training enhanced by visual learning. arXiv preprint arXiv:2106.01804 (2021).","DOI":"10.18653\/v1\/2021.acl-long.42"},{"key":"1061_CR28","doi-asserted-by":"crossref","unstructured":"Li, L. H., Zhang, P., Zhang, H., et al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition pp. 10965\u201310975. (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"1061_CR29","first-page":"35959","volume":"35","author":"Y Gao","year":"2022","unstructured":"Gao, Y., Liu, J., Xu, Z., et al.: Pyramidclip: Hierarchical feature alignment for vision-language model pre-training. Adv. Neural. Inf. Process. Syst. 35, 35959\u201335970 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1061_CR30","doi-asserted-by":"crossref","unstructured":"Gao, Y., Liu, J., Xu, Z., et al.: Softclip: Softer cross-modal alignment makes clip stronger. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 1860\u20131868. (2024)","DOI":"10.1609\/aaai.v38i3.27955"},{"key":"1061_CR31","doi-asserted-by":"crossref","unstructured":"Hu, Z., Iscen, A., Sun, C., et al.: Reveal: retrieval-augmented visual-language pre-training with multi-source multimodal knowledge memory. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition[C], pp. 23369\u201323379. (2023)","DOI":"10.1109\/CVPR52729.2023.02238"},{"key":"1061_CR32","doi-asserted-by":"crossref","unstructured":"Lin, J., Yin, H., Ping, W., et al.: Vila: On pre-training for visual language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26689\u201326699 (2024).","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"1061_CR33","unstructured":"Devlin, J., Chang, M., Lee, K., et al.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"1061_CR34","unstructured":"Yang, Z., Dai, Z., Yang, Y., et al.: Xlnet: Generalized autoregressive pre-training for language understanding. Advances in Neural Information Processing Systems, p. 32. (2019)"},{"key":"1061_CR35","unstructured":"Lan, Z., Chen, M., Goodman, S., et al.: Albert: A lite bert for self-supervised learning of language representations. arXiv preprint arXiv:1909.11942 (2019)."},{"key":"1061_CR36","unstructured":"Clark, K., Luong, M., Le, Q. V., et al.: Electra: Pre-training text encoders as discriminators rather than generators. arXiv preprint arXiv:2003.10555 (2020)."},{"key":"1061_CR37","unstructured":"Radford, A., Narasimhan, K., Salimans, T., et al.: Improving language understanding by generative pre-training (2018)."},{"key":"1061_CR38","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"1061_CR39","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Friedman, D., Chen, D.: Factual probing is [mask]: Learning vs. learning to recall. arXiv preprint arXiv:2104.05240 (2021).","DOI":"10.18653\/v1\/2021.naacl-main.398"},{"key":"1061_CR40","doi-asserted-by":"crossref","unstructured":"Khattak, M. U., Rasheed. H., Maaz, M., et al.: Maple: Multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19113\u201319122. (2023)","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"1061_CR41","doi-asserted-by":"crossref","unstructured":"Chen, Y., Liu, Y., Dong, L., et al.: Adaprompt: Adaptive model training for prompt-based nlp. arXiv preprint arXiv:2202.04824 (2022).","DOI":"10.18653\/v1\/2022.findings-emnlp.448"},{"key":"1061_CR42","doi-asserted-by":"crossref","unstructured":"Ding, N., Chen, Y., Han, X., et al.: Prompt-learning for fine-grained entity typing. arXiv preprint arXiv:2108.10604 (2021).","DOI":"10.18653\/v1\/2022.findings-emnlp.512"},{"issue":"10","key":"1061_CR43","doi-asserted-by":"publisher","first-page":"5328","DOI":"10.1109\/TKDE.2023.3332787","volume":"36","author":"Y Zhu","year":"2023","unstructured":"Zhu, Y., Wang, Y., Qiang, J., et al.: Prompt-learning for short text classification. IEEE Trans. Knowl. Data Eng. 36(10), 5328\u20135339 (2023)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"1061_CR44","unstructured":"Qi, D., Su, L., Song, J., et al.: Imagebert: Cross-modal pre-training with large-scale weak-supervised image-text data. arXiv preprint arXiv:2001.07966 (2020)."},{"issue":"9","key":"1061_CR45","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., et al.: Learning to prompt for vision-language models. Int. J. Comput. Vis. 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vis."}],"container-title":["International Journal of Computational Intelligence Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44196-025-01061-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s44196-025-01061-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44196-025-01061-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T09:31:28Z","timestamp":1764927088000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s44196-025-01061-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,5]]},"references-count":45,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["1061"],"URL":"https:\/\/doi.org\/10.1007\/s44196-025-01061-6","relation":{},"ISSN":["1875-6883"],"issn-type":[{"value":"1875-6883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12,5]]},"assertion":[{"value":"5 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 July 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 October 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 December 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"322"}}