{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T04:26:28Z","timestamp":1775276788359,"version":"3.50.1"},"reference-count":44,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1016\/j.patcog.2024.110809","type":"journal-article","created":{"date-parts":[[2024,7,20]],"date-time":"2024-07-20T00:39:01Z","timestamp":1721435941000},"page":"110809","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":9,"special_numbering":"C","title":["Vision-language pre-training via modal interaction"],"prefix":"10.1016","volume":"156","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2443-2820","authenticated-orcid":false,"given":"Hang","family":"Cheng","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hehui","family":"Ye","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaofei","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ximeng","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3676-6011","authenticated-orcid":false,"given":"Fei","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Meiqing","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2024.110809_b1","series-title":"Road Traffic Modeling and Management: Using Statistical Monitoring and Deep Learning","author":"Harrou","year":"2021"},{"key":"10.1016\/j.patcog.2024.110809_b2","series-title":"2022 IEEE 20th International Conference on Industrial Informatics","first-page":"107","article-title":"Efficient SST prediction in the red sea using hybrid deep learning-based approach","author":"Hittawe","year":"2022"},{"issue":"2","key":"10.1016\/j.patcog.2024.110809_b3","doi-asserted-by":"crossref","first-page":"021012","DOI":"10.1117\/1.JEI.28.2.021012","article-title":"Abnormal events detection using deep neural networks: application to extreme sea surface temperature detection in the red sea","volume":"28","author":"Hittawe","year":"2019","journal-title":"J. Electron. Imaging"},{"key":"10.1016\/j.patcog.2024.110809_b4","first-page":"53","article-title":"Active learning with Bayesian CNN using the BALD method for hyperspectral image classification","volume":"2023","author":"Qadir","year":"2023","journal-title":"Mesop. J. Big Data"},{"key":"10.1016\/j.patcog.2024.110809_b5","series-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"10.1016\/j.patcog.2024.110809_b6","series-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"10.1016\/j.patcog.2024.110809_b7","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"4","key":"10.1016\/j.patcog.2024.110809_b8","doi-asserted-by":"crossref","first-page":"757","DOI":"10.3390\/app9040757","article-title":"An on-line and adaptive method for detecting abnormal events in videos using spatio-temporal convnet","volume":"9","author":"Bouindour","year":"2019","journal-title":"Appl. Sci."},{"key":"10.1016\/j.patcog.2024.110809_b9","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"911","article-title":"Fine-tune the pretrained atst model for sound event detection","author":"Shao","year":"2024"},{"key":"10.1016\/j.patcog.2024.110809_b10","doi-asserted-by":"crossref","unstructured":"K. He, X. Chen, S. Xie, Y. Li, P. Doll\u00e1r, R. Girshick, Masked autoencoders are scalable vision learners, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 16000\u201316009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"10.1016\/j.patcog.2024.110809_b11","article-title":"Vision-language models for vision tasks: A survey","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2024.110809_b12","doi-asserted-by":"crossref","unstructured":"N. Rasiwasia, J. Costa Pereira, E. Coviello, G. Doyle, G.R. Lanckriet, R. Levy, N. Vasconcelos, A new approach to cross-modal multimedia retrieval, in: Proceedings of the 18th ACM International Conference on Multimedia, 2010, pp. 251\u2013260.","DOI":"10.1145\/1873951.1873987"},{"key":"10.1016\/j.patcog.2024.110809_b13","doi-asserted-by":"crossref","unstructured":"L. Ma, Z. Lu, L. Shang, H. Li, Multimodal convolutional neural networks for matching image and sentence, in: Proceedings of the IEEE International Conference on Computer Vision, 2015, pp. 2623\u20132631.","DOI":"10.1109\/ICCV.2015.301"},{"key":"10.1016\/j.patcog.2024.110809_b14","doi-asserted-by":"crossref","unstructured":"Y. Liu, Y. Guo, E.M. Bakker, M.S. Lew, Learning a recurrent residual fusion network for multimodal matching, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 4107\u20134116.","DOI":"10.1109\/ICCV.2017.442"},{"key":"10.1016\/j.patcog.2024.110809_b15","doi-asserted-by":"crossref","unstructured":"Y. Zhang, H. Lu, Deep cross-modal projection learning for image-text matching, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 686\u2013701.","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"10.1016\/j.patcog.2024.110809_b16","doi-asserted-by":"crossref","unstructured":"N. Sarafianos, X. Xu, I.A. Kakadiaris, Adversarial representation learning for text-to-image matching, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 5814\u20135824.","DOI":"10.1109\/ICCV.2019.00591"},{"key":"10.1016\/j.patcog.2024.110809_b17","doi-asserted-by":"crossref","unstructured":"Z. Wang, X. Liu, H. Li, L. Sheng, J. Yan, X. Wang, J. Shao, Camp: Cross-modal adaptive message passing for text-image retrieval, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 5764\u20135773.","DOI":"10.1109\/ICCV.2019.00586"},{"key":"10.1016\/j.patcog.2024.110809_b18","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.109211","article-title":"Semi-supervised cross-modal hashing via modality-specific and cross-modal graph convolutional networks","volume":"136","author":"Wu","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2024.110809_b19","doi-asserted-by":"crossref","unstructured":"S. Wang, R. Wang, Z. Yao, S. Shan, X. Chen, Cross-modal scene graph matching for relationship-aware image-text retrieval, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2020, pp. 1508\u20131517.","DOI":"10.1109\/WACV45572.2020.9093614"},{"key":"10.1016\/j.patcog.2024.110809_b20","first-page":"2417","article-title":"Structure-CLIP: Towards scene graph knowledge to enhance multi-modal structured representations","volume":"vol. 38","author":"Huang","year":"2024"},{"key":"10.1016\/j.patcog.2024.110809_b21","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110555","article-title":"CAST: Cross-modal retrieval and visual conditioning for image captioning","volume":"153","author":"Cao","year":"2024","journal-title":"Pattern Recognit."},{"issue":"12","key":"10.1016\/j.patcog.2024.110809_b22","doi-asserted-by":"crossref","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","article-title":"Babytalk: Understanding and generating simple image descriptions","volume":"35","author":"Kulkarni","year":"2013","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2024.110809_b23","doi-asserted-by":"crossref","unstructured":"O. Vinyals, A. Toshev, S. Bengio, D. Erhan, Show and tell: A neural image caption generator, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2015, pp. 3156\u20133164.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"10.1016\/j.patcog.2024.110809_b24","series-title":"European Conference on Computer Vision","first-page":"353","article-title":"Learning to generate grounded visual captions without localization supervision","author":"Ma","year":"2020"},{"key":"10.1016\/j.patcog.2024.110809_b25","doi-asserted-by":"crossref","unstructured":"L. Zhou, Y. Kalantidis, X. Chen, J.J. Corso, M. Rohrbach, Grounded video description, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 6578\u20136587.","DOI":"10.1109\/CVPR.2019.00674"},{"key":"10.1016\/j.patcog.2024.110809_b26","doi-asserted-by":"crossref","unstructured":"P. Anderson, X. He, C. Buehler, D. Teney, M. Johnson, S. Gould, L. Zhang, Bottom-up and top-down attention for image captioning and visual question answering, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 6077\u20136086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"10.1016\/j.patcog.2024.110809_b27","doi-asserted-by":"crossref","unstructured":"X. Yang, K. Tang, H. Zhang, J. Cai, Auto-encoding scene graphs for image captioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 10685\u201310694.","DOI":"10.1109\/CVPR.2019.01094"},{"key":"10.1016\/j.patcog.2024.110809_b28","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109420","article-title":"Towards local visual modeling for image captioning","volume":"138","author":"Ma","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2024.110809_b29","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"Lu","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2024.110809_b30","series-title":"Lxmert: Learning cross-modality encoder representations from transformers","author":"Tan","year":"2019"},{"key":"10.1016\/j.patcog.2024.110809_b31","first-page":"3208","article-title":"Ernie-vil: Knowledge enhanced vision-language representations through scene graphs","volume":"vol. 35","author":"Yu","year":"2021"},{"key":"10.1016\/j.patcog.2024.110809_b32","doi-asserted-by":"crossref","unstructured":"H. Kim, H.Y. Jhoo, E. Park, S. Yoo, Tag2pix: Line art colorization using text tag with secat and changing loss, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 9056\u20139065.","DOI":"10.1109\/ICCV.2019.00915"},{"key":"10.1016\/j.patcog.2024.110809_b33","article-title":"Bilinear attention networks","volume":"31","author":"Kim","year":"2018","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2024.110809_b34","series-title":"International Conference on Machine Learning","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.patcog.2024.110809_b35","series-title":"Uniter: Learning universal image-text representations","author":"Chen","year":"2019"},{"key":"10.1016\/j.patcog.2024.110809_b36","doi-asserted-by":"crossref","unstructured":"S. Sun, Y.-C. Chen, L. Li, S. Wang, Y. Fang, J. Liu, Lightningdot: Pre-training visual-semantic embeddings for real-time image-text retrieval, in: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, 2021, pp. 982\u2013997.","DOI":"10.18653\/v1\/2021.naacl-main.77"},{"key":"10.1016\/j.patcog.2024.110809_b37","series-title":"International Conference on Machine Learning","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","author":"Kim","year":"2021"},{"key":"10.1016\/j.patcog.2024.110809_b38","series-title":"European Conference on Computer Vision","first-page":"121","article-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks","author":"Li","year":"2020"},{"key":"10.1016\/j.patcog.2024.110809_b39","series-title":"Retrieve fast, rerank smart: Cooperative and joint approaches for improved cross-modal retrieval","author":"Geigle","year":"2021"},{"key":"10.1016\/j.patcog.2024.110809_b40","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2024.110809_b41","series-title":"RefineCap: Concept-aware refinement for image captioning","author":"Chai","year":"2021"},{"key":"10.1016\/j.patcog.2024.110809_b42","doi-asserted-by":"crossref","unstructured":"L. Huang, W. Wang, J. Chen, X.-Y. Wei, Attention on attention for image captioning, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 4634\u20134643.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"10.1016\/j.patcog.2024.110809_b43","doi-asserted-by":"crossref","unstructured":"T. Kim, G. Song, S. Lee, S. Kim, Y. Seo, S. Lee, S.H. Kim, H. Lee, K. Bae, L-Verse: Bidirectional Generation Between Image and Text, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 16526\u201316536.","DOI":"10.1109\/CVPR52688.2022.01603"},{"key":"10.1016\/j.patcog.2024.110809_b44","doi-asserted-by":"crossref","unstructured":"Y. Pan, T. Yao, Y. Li, T. Mei, X-linear attention networks for image captioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 10971\u201310980.","DOI":"10.1109\/CVPR42600.2020.01098"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320324005600?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320324005600?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,9,5]],"date-time":"2024-09-05T02:11:22Z","timestamp":1725502282000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320324005600"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":44,"alternative-id":["S0031320324005600"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2024.110809","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2024,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Vision-language pre-training via modal interaction","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2024.110809","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"110809"}}