{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,18]],"date-time":"2026-04-18T03:06:48Z","timestamp":1776481608634,"version":"3.51.2"},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"11-12","license":[{"start":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T00:00:00Z","timestamp":1705536000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T00:00:00Z","timestamp":1705536000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Scientific and Technological Research Program of Chongqing Municipal Education Commission","award":["KJZD-K202201604"],"award-info":[{"award-number":["KJZD-K202201604"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Soft Comput"],"published-print":{"date-parts":[[2024,6]]},"DOI":"10.1007\/s00500-023-09536-4","type":"journal-article","created":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T08:02:29Z","timestamp":1705564949000},"page":"6969-6982","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Enhancing machine vision: the impact of a novel innovative technology on video question-answering"],"prefix":"10.1007","volume":"28","author":[{"given":"Songjian","family":"Dan","sequence":"first","affiliation":[]},{"given":"Wei","family":"Feng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,1,18]]},"reference":[{"key":"9536_CR1","first-page":"2148","volume":"2021","author":"A Akula","year":"2021","unstructured":"Akula A, Changpinyo S, Gong B et al (2021) Crossvqa: scalably generating benchmarks for systematically testing vqa generalization. Proc Conf Empir Methods Nat Lang Process 2021:2148\u20132166","journal-title":"Proc Conf Empir Methods Nat Lang Process"},{"key":"9536_CR2","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C et al (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"9536_CR3","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, et al (2015) Vqa: Visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"key":"9536_CR4","unstructured":"Berrios W, Mittal G, Thrush T et al (2023) Towards language models that can see: computer vision through the LENS of natural language. arXiv preprint arXiv:2306.16410"},{"key":"9536_CR5","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown T, Mann B, Ryder N et al (2020) Language models are few-shot learners. Adv Neural Inf Process Syst 33:1877\u20131901","journal-title":"Adv Neural Inf Process Syst"},{"key":"9536_CR6","doi-asserted-by":"crossref","unstructured":"Dai W, Hou L, Shang L et al (2022) Enabling multimodal generation on CLIP via vision-language knowledge distillation. arXiv preprint arXiv:2203.06386","DOI":"10.18653\/v1\/2022.findings-acl.187"},{"key":"9536_CR7","unstructured":"Devlin J, Chang M W, Lee K et al (2018) Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"9536_CR8","doi-asserted-by":"crossref","unstructured":"Dong X, Ning X, Xu J et al (2023) A recognizable expression line portrait synthesis method in portrait rendering robot. In: IEEE Transactions on Computational Social Systems","DOI":"10.1109\/TCSS.2023.3241003"},{"key":"9536_CR9","doi-asserted-by":"crossref","unstructured":"Dou Z Y, Xu Y, Gan Z et al (2022) An empirical study of training end-to-end vision-and-language transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 18166\u201318176","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"9536_CR10","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1007\/s11243-011-9477-z","volume":"36","author":"AM El-Hendawy","year":"2011","unstructured":"El-Hendawy AM, Fayed AM, Mostafa MR (2011) Complexes of a diacetylmonoxime Schiff base of S-methyldithiocarbazate (H2 damsm) with Fe (III), Ru (III)\/Ru (II), and V (IV); catalytic activity and X-ray crystal structure of [Fe (Hdamsm) 2] NO3H2O. Transition Met Chem 36:351\u2013361","journal-title":"Transition Met Chem"},{"key":"9536_CR11","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1016\/j.neucom.2018.11.102","volume":"391","author":"L Gao","year":"2020","unstructured":"Gao L, Cao L, Xu X, Shao J, Song J (2020) Question-led object attention for visual question answering. Neurocomputing 391:227\u2013233","journal-title":"Neurocomputing"},{"key":"9536_CR12","doi-asserted-by":"crossref","unstructured":"Guo J, Li J, Li D et al (2023) From images to textual prompts: zero-shot visual question answering with frozen large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 10867\u201310877","DOI":"10.1109\/CVPR52729.2023.01046"},{"issue":"1","key":"9536_CR13","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1109\/TPAMI.2022.3152247","volume":"45","author":"K Han","year":"2022","unstructured":"Han K, Wang Y, Chen H et al (2022) A survey on vision transformer. IEEE Trans Pattern Anal Mach Intell 45(1):87\u2013110","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9536_CR14","doi-asserted-by":"publisher","first-page":"152","DOI":"10.1016\/j.bbi.2023.02.022","volume":"110","author":"EL Hill-Yardin","year":"2023","unstructured":"Hill-Yardin EL, Hutchinson MR, Laycock R et al (2023) A Chat (GPT) about the future of scientific publishing. Brain Behav Immun 110:152\u2013154","journal-title":"Brain Behav Immun"},{"key":"9536_CR15","doi-asserted-by":"publisher","DOI":"10.1016\/j.image.2020.115782","volume":"83","author":"R Hou","year":"2020","unstructured":"Hou R, Zhao YH, Hu Y et al (2020) No-reference video quality evaluation by a deep transfer CNN architecture. Signal Process Image Commun 83:115782","journal-title":"Signal Process Image Commun"},{"key":"9536_CR16","doi-asserted-by":"crossref","unstructured":"Jin W, Cheng Y, Shen Y et al (2021) A good prompt is worth millions of parameters: low-resource prompt-based learning for vision-language models. arXiv preprint arXiv:2110.08484","DOI":"10.18653\/v1\/2022.acl-long.197"},{"key":"9536_CR17","doi-asserted-by":"crossref","unstructured":"Li X, Yin X, Li C, Zhang P, Hu X, Zhang L, Wang L, Hu H, Dong L, Wei F et al (2020) Oscar: object-semantics aligned pre-training for vision-language tasks. In: Computer Vision\u2014ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"9536_CR19","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li J, Selvaraju R, Gotmare A et al (2021) Align before fuse: vision and language representation learning with momentum distillation. Adv Neural Inf Process Syst 34:9694\u20139705","journal-title":"Adv Neural Inf Process Syst"},{"key":"9536_CR20","unstructured":"Li J, Li D, Xiong C et al (2022) Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning. PMLR, pp 12888\u201312900"},{"key":"9536_CR21","doi-asserted-by":"publisher","first-page":"110487","DOI":"10.1016\/j.asoc.2023.110487","volume":"144","author":"D Li","year":"2023","unstructured":"Li D, Jiang MR, Li MW et al (2023) A floating offshore platform motion forecasting approach based on EEMD hybrid ConvLSTM and chaotic quantum ALO. Appl Soft Comput 144:110487","journal-title":"Appl Soft Comput"},{"key":"9536_CR22","doi-asserted-by":"publisher","first-page":"105584","DOI":"10.1016\/j.asoc.2019.105584","volume":"82","author":"Y Liu","year":"2019","unstructured":"Liu Y, Zhang X, Huang F, Tang X, Li Z (2019) Visual question answering via attention-based syntactic structure tree-LSTM. Appl Soft Comput 82:105584","journal-title":"Appl Soft Comput"},{"key":"9536_CR24","doi-asserted-by":"crossref","unstructured":"Marino K, Rastegari M, Farhadi A et al (2019) Ok-vqa: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/cvf conference on computer vision and pattern recognition, pp 3195\u20133204","DOI":"10.1109\/CVPR.2019.00331"},{"key":"9536_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108873","volume":"131","author":"X Ning","year":"2022","unstructured":"Ning X, Tian W, Yu Z, Li W, Bai X, Wang Y (2022) HCFNN: high-order coverage function neural network for image classification. Pattern Recogn 131:108873","journal-title":"Pattern Recogn"},{"key":"9536_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109216","volume":"136","author":"X Ning","year":"2023","unstructured":"Ning X, Tian W, He F, Bai X, Sun L, Li W (2023) Hyper-sausage coverage function neuron model and learning algorithm for image classification. Pattern Recogn 136:109216","journal-title":"Pattern Recogn"},{"key":"9536_CR27","doi-asserted-by":"crossref","unstructured":"Plummer BA, Wang L, Cervantes CM et al (2015) Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision, pp 2641\u20132649","DOI":"10.1109\/ICCV.2015.303"},{"key":"9536_CR28","unstructured":"Scao TL, Fan A, Akiki C et al (2022) Bloom: a 176b-parameter open-access multilingual language model. arXiv preprint arXiv:2211.05100"},{"key":"9536_CR29","doi-asserted-by":"crossref","unstructured":"Schwenk D, Khandelwal A, Clark C et al (2022) A-okvqa: a benchmark for visual question answering using world knowledge. In: European Conference on Computer Vision. Cham: Springer Nature Switzerland, pp 146\u2013162","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"9536_CR30","doi-asserted-by":"crossref","unstructured":"Sharma DK (2021) Information measure computation and its impact in MI COCO dataset. In: 2021 7th International Conference on Advanced Computing and Communication Systems (ICACCS). IEEE, pp 1964\u20131969","DOI":"10.1109\/ICACCS51430.2021.9441788"},{"key":"9536_CR31","unstructured":"Shen S, Li LH, Tan H, Bansal M, Rohrbach A, Chang K-W, Yao Z, Keutzer K (2021) How much can clip benefit vision-and-language tasks? arXiv preprint arXiv:2107.06383"},{"key":"9536_CR32","doi-asserted-by":"crossref","unstructured":"Si Q, Lin Z, Zheng M et al (2021) Check it again: progressive visual question answering via visual entailment. arXiv preprint arXiv:2106.04605","DOI":"10.18653\/v1\/2021.acl-long.317"},{"key":"9536_CR33","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126300","volume":"545","author":"S Tian","year":"2023","unstructured":"Tian S, Li W, Ning X et al (2023) Continuous transfer of neural network representational similarity for incremental learning. Neurocomputing 545:126300","journal-title":"Neurocomputing"},{"key":"9536_CR34","unstructured":"Vaswani A, Shazeer N, Parmar N et al (2017) Attention is all you need. Adv Neural Inf Process Systems 30"},{"key":"9536_CR35","first-page":"1","volume":"60","author":"C Wang","year":"2022","unstructured":"Wang C, Ning X, Sun L et al (2022) Learning discriminative features by covering local geometric space for point cloud analysis. IEEE Trans Geosci Remote Sens 60:1\u201315","journal-title":"IEEE Trans Geosci Remote Sens"},{"issue":"1","key":"9536_CR36","first-page":"1","volume":"23","author":"J Whalen","year":"2023","unstructured":"Whalen J, Mouza C (2023) ChatGPT: challenges, opportunities, and implications for teacher education. Contemp Issues Technol Teacher Educ 23(1):1\u201323","journal-title":"Contemp Issues Technol Teacher Educ"},{"key":"9536_CR37","doi-asserted-by":"crossref","unstructured":"Yang Z, He X, Gao J et al (2016) Stacked attention networks for image question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 21\u201329","DOI":"10.1109\/CVPR.2016.10"},{"key":"9536_CR38","doi-asserted-by":"crossref","unstructured":"Yang Z, Gan Z, Wang J, et al. An empirical study of gpt-3 for few-shot knowledge-based vqa. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol 36, pp 3081\u20133089","DOI":"10.1609\/aaai.v36i3.20215"},{"issue":"7","key":"9536_CR39","doi-asserted-by":"publisher","first-page":"9277","DOI":"10.1007\/s11042-021-11549-w","volume":"81","author":"L Zhang","year":"2022","unstructured":"Zhang L, Li H, Zhu R, Du P (2022) An infrared and visible image fusion algorithm based on ResNet-152. Multimed Tools Appl 81(7):9277\u20139287","journal-title":"Multimed Tools Appl"},{"key":"9536_CR40","unstructured":"Zhang S, Roller S, Goyal N et al (2022) Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068"}],"container-title":["Soft Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00500-023-09536-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00500-023-09536-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00500-023-09536-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,19]],"date-time":"2024-07-19T13:05:38Z","timestamp":1721394338000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00500-023-09536-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1,18]]},"references-count":38,"journal-issue":{"issue":"11-12","published-print":{"date-parts":[[2024,6]]}},"alternative-id":["9536"],"URL":"https:\/\/doi.org\/10.1007\/s00500-023-09536-4","relation":{},"ISSN":["1432-7643","1433-7479"],"issn-type":[{"value":"1432-7643","type":"print"},{"value":"1433-7479","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,1,18]]},"assertion":[{"value":"30 November 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 January 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have not disclosed any conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}