{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T08:02:22Z","timestamp":1770883342886,"version":"3.50.1"},"reference-count":60,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T00:00:00Z","timestamp":1770854400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T00:00:00Z","timestamp":1770854400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Front. Comput. Sci."],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1007\/s11704-025-41126-5","type":"journal-article","created":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T05:52:35Z","timestamp":1770875555000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Patching the visual ability of large multimodal models by collaborating with small 
models"],"prefix":"10.1007","volume":"20","author":[{"given":"Hao","family":"Liang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaolong","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Meina","family":"Kan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiguang","family":"Shan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xilin","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,12]]},"reference":[{"key":"41126_CR1","first-page":"26296","volume-title":"Proceedings of 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"H Liu","year":"2024","unstructured":"Liu H, Li C, Li Y, Lee Y J. Improved baselines with visual instruction tuning. In: Proceedings of 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2024, 26296\u201326306"},{"key":"41126_CR2","unstructured":"OpenAI, Achiam J, Adler S, Agarwal S, Ahmad L, et al. GPT-4 technical report. 2023, arXiv preprint arXiv: 2303.08774"},{"key":"41126_CR3","unstructured":"Yang Z, Li L, Lin K, Wang J, Lin C C, Liu Z, Wang L. The dawn of LMMs: preliminary explorations with GPT-4V (ision). 2023, arXiv preprint arXiv: 2309.17421"},{"issue":"12","key":"41126_CR4","doi-asserted-by":"publisher","first-page":"220102","DOI":"10.1007\/s11432-024-4235-6","volume":"67","author":"Y Liu","year":"2024","unstructured":"Liu Y, Li Z, Huang M, Yang B, Yu W, Li C, Yin X C, Liu C L, Jin L, Bai X. OCRBench: on the hidden mystery of OCR in large multimodal models. Science China Information Sciences, 2024, 67(12): 220102","journal-title":"Science China Information Sciences"},{"key":"41126_CR5","unstructured":"Wu C, Yin S, Qi W, Wang X, Tang Z, Duan N. 
Visual ChatGPT: talking, drawing and editing with visual foundation models. 2023, arXiv preprint arXiv: 2303.04671"},{"key":"41126_CR6","first-page":"2997","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"T Schick","year":"2023","unstructured":"Schick T, Dwivedi-Yu J, Dess\u00ed R, Raileanu R, Lomeli M, Hambro E, Zettlemoyer L, Cancedda N, Scialom T. Toolformer: language models can teach themselves to use tools. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 2997"},{"key":"41126_CR7","volume-title":"Introducing ChatGPT","author":"OpenAI","year":"2022","unstructured":"OpenAI. Introducing ChatGPT. See openai.com\/blog\/chatgpt\/website, 2022"},{"key":"41126_CR8","unstructured":"Touvron H, Lavril T, Izacard G, Martinet X, Lachaux M A, Lacroix T, Rozi\u00e8re B, Goyal N, Hambro E, Azhar F, Rodriguez A, Joulin A, Grave E, Lample G. LLaMA: open and efficient foundation language models. 2023, arXiv preprint arXiv: 2302.13971"},{"key":"41126_CR9","unstructured":"Touvron H, Martin L, Stone K, Albert P, Almahairi A, et al. Llama 2: open foundation and fine-tuned chat models. 2023, arXiv preprint arXiv: 2307.09288"},{"key":"41126_CR10","volume-title":"Proceedings of the 11th International Conference on Learning Representations","author":"A Zeng","year":"2023","unstructured":"Zeng A, Liu X, Du Z, Wang Z, Lai H, Ding M, Yang Z, Xu Y, Zheng W, Xia X, Tam W L, Ma Z, Xue Y, Zhai J, Chen W, Liu Z, Zhang P, Dong Y, Tang J. GLM-130B: an open bilingual pre-trained model. In: Proceedings of the 11th International Conference on Learning Representations. 2023"},{"key":"41126_CR11","volume-title":"Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality","author":"The Vicuna Team","year":"2023","unstructured":"The Vicuna Team. Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality. 
See lmsys.org\/blog\/2023-03-30-vicuna\/website, 2023"},{"key":"41126_CR12","first-page":"23716","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"J B Alayrac","year":"2022","unstructured":"Alayrac J B, Donahue J, Luc P, Miech A, Barr I, et al. Flamingo: a visual language model for few-shot learning. In: Proceedings of the 36th International Conference on Neural Information Processing Systems. 2022, 23716\u201323736"},{"key":"41126_CR13","volume-title":"Proceedings of the 12th International Conference on Learning Representations","author":"D Zhu","year":"2024","unstructured":"Zhu D, Chen J, Shen X, Li X, Elhoseiny M. MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: Proceedings of the 12th International Conference on Learning Representations. 2024"},{"key":"41126_CR14","volume-title":"Proceedings of the 9th International Conference on Learning Representations","author":"A Dosovitskiy","year":"2021","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N. An image is worth 16\u00d716 words: transformers for image recognition at scale. In: Proceedings of the 9th International Conference on Learning Representations. 2021"},{"key":"41126_CR15","first-page":"814","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"J Li","year":"2023","unstructured":"Li J, Li D, Savarese S, Hoi S. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the 40th International Conference on Machine Learning. 2023, 814"},{"key":"41126_CR16","unstructured":"Gemini Team Google, Anil R, Borgeaud S, Alayrac J B, Yu J, et al. Gemini: a family of highly capable multimodal models. 
2023, arXiv preprint arXiv: 2312.11805"},{"key":"41126_CR17","unstructured":"Li K, He Y, Wang Y, Li Y, Wang W, Luo P, Wang Y, Wang L, Qiao Y. VideoChat: chat-centric video understanding. 2023, arXiv preprint arXiv: 2305.06355"},{"key":"41126_CR18","first-page":"323","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"Y Li","year":"2025","unstructured":"Li Y, Wang C, Jia J. LLaMA-VID: an image is worth 2 tokens in large language models. In: Proceedings of the 18th European Conference on Computer Vision. 2025, 323\u2013340"},{"key":"41126_CR19","unstructured":"Rubenstein P K, Asawaroengchai C, Nguyen D D, Bapna A, Borsos Z, et al. AudioPaLM: a large language model that can speak and listen. 2023, arXiv preprint arXiv: 2306.12925"},{"key":"41126_CR20","first-page":"15180","volume-title":"Proceedings of 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"R Girdhar","year":"2023","unstructured":"Girdhar R, El-Nouby A, Liu Z, Singh M, Alwala K V, Joulin A, Misra I. ImageBind one embedding space to bind them all. In: Proceedings of 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2023, 15180\u201315190"},{"key":"41126_CR21","first-page":"38154","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Y Shen","year":"2023","unstructured":"Shen Y, Song K, Tan X, Li D, Lu W, Zhuang Y. HuggingGPT: solving AI tasks with ChatGPT and its friends in hugging face. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 38154\u201338180"},{"key":"41126_CR22","unstructured":"Thoppilan R, De Freitas D, Hall J, Shazeer N, Kulshreshtha A, et al. LaMDA: language models for dialog applications. 
2022, arXiv preprint arXiv: 2201.08239"},{"key":"41126_CR23","first-page":"10764","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"L Gao","year":"2023","unstructured":"Gao L, Madaan A, Zhou S, Alon U, Liu P, Yang Y, Callan J, Neubig G. PAL: program-aided language models. In: Proceedings of the 40th International Conference on Machine Learning. 2023, 10764\u201310799"},{"key":"41126_CR24","first-page":"3149","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"R Yang","year":"2023","unstructured":"Yang R, Song L, Li Y, Zhao S, Ge Y, Li X, Shan Y. GPT4Tools: teaching large language model to use tools via self-instruction. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 3149"},{"key":"41126_CR25","unstructured":"Yang Z, Li L, Wang J, Lin K, Azarnasab E, Ahmed F, Liu Z, Liu C, Zeng M, Wang L. MM-REACT: prompting ChatGPT for multimodal reasoning and action. 2023, arXiv preprint arXiv: 2303.11381"},{"key":"41126_CR26","unstructured":"Weng Y, He S, Liu K, Liu S, Zhao J. ControlLM: crafting diverse personalities for language models. 2024, arXiv preprint arXiv: 2402.10151"},{"key":"41126_CR27","first-page":"126","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"S Liu","year":"2025","unstructured":"Liu S, Cheng H, Liu H, Zhang H, Li F, Ren T, Zou X, Yang J, Su H, Zhu J, Zhang L, Gao J, Li C. LLaVA-Plus: learning to use tools for creating multimodal agents. In: Proceedings of the 18th European Conference on Computer Vision. 2025, 126\u2013142"},{"key":"41126_CR28","first-page":"55976","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"Z Yang","year":"2024","unstructured":"Yang Z, Chen G, Li X, Wang W, Yang Y. DoraemonGPT: toward understanding dynamic scenes with large language models (Exemplified as a video agent). 
In: Proceedings of the 41st International Conference on Machine Learning. 2024, 55976\u201355997"},{"issue":"12","key":"41126_CR29","doi-asserted-by":"publisher","first-page":"1551","DOI":"10.1631\/FITEE.2100463","volume":"22","author":"Y Yang","year":"2021","unstructured":"Yang Y, Zhuang Y, Pan Y. Multiple knowledge representation for big data artificial intelligence: framework, applications, and case studies. Frontiers of Information Technology & Electronic Engineering, 2021, 22(12): 1551\u20131558","journal-title":"Frontiers of Information Technology & Electronic Engineering"},{"key":"41126_CR30","first-page":"3750","volume-title":"Proceedings of 2019 IEEE\/CVF International Conference on Computer Vision","author":"R Quan","year":"2019","unstructured":"Quan R, Dong X, Wu Y, Zhu L, Yang Y. Auto-ReID: searching for a part-aware ConvNet for person re-identification. In: Proceedings of 2019 IEEE\/CVF International Conference on Computer Vision. 2019, 3750\u20133759"},{"key":"41126_CR31","volume-title":"Proceedings of the 7th International Conference on Learning Representations","author":"H Liu","year":"2019","unstructured":"Liu H, Simonyan K, Yang Y. DARTS: differentiable architecture search. In: Proceedings of the 7th International Conference on Learning Representations. 2019"},{"issue":"70","key":"41126_CR32","first-page":"1","volume":"25","author":"H W Chung","year":"2024","unstructured":"Chung H W, Hou L, Longpre S, Zoph B, Tai Y, et al. Scaling instruction-finetuned language models. Journal of Machine Learning Research, 2024, 25(70): 1\u201353","journal-title":"Journal of Machine Learning Research"},{"key":"41126_CR33","unstructured":"Iyer S, Lin X V, Pasunuru R, Mihaylov T, Simig D, Yu P, Shuster K, Wang T, Liu Q, Koura P S, Li X, O\u2019Horo B, Pereyra G, Wang J, Dewan C, Celikyilmaz A, Zettlemoyer L, Stoyanov V. OPT-IML: scaling language model instruction meta learning through the lens of generalization. 
2022, arXiv preprint arXiv: 2212.12017"},{"key":"41126_CR34","first-page":"13484","volume-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics","author":"Y Wang","year":"2023","unstructured":"Wang Y, Kordi Y, Mishra S, Liu A, Smith N A, Khashabi D, Hajishirzi H. Self-instruct: aligning language models with self-generated instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics. 2023, 13484\u201313508"},{"issue":"7972","key":"41126_CR35","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1038\/s41586-023-06291-2","volume":"620","author":"K Singhal","year":"2023","unstructured":"Singhal K, Azizi S, Tu T, Mahdavi S S, Wei J, Chung H W, Scales N, Tanwani A, Cole-Lewis H, Pfohl S, Payne P, Seneviratne M, Gamble P, Kelly C, Babiker A, Sch\u00e4rli N, Chowdhery A, Mansfield P, Demner-Fushman D, Ag\u00fcera Y arcas B, Webster D, Corrado G S, Matias Y, Chou K, Gottweis J, Tomasev N, Liu Y, Rajkomar A, Barral J, Semturs C, Karthikesalingam A, Natarajan V. Large language models encode clinical knowledge. Nature, 2023, 620(7972): 172\u2013180","journal-title":"Nature"},{"key":"41126_CR36","unstructured":"Taylor R, Kardas M, Cucurull G, Scialom T, Hartshorn A, Saravia E, Poulton A, Kerkez V, Stojnic R. Galactica: a large language model for science. 2022, arXiv preprint arXiv: 2211.09085"},{"key":"41126_CR37","unstructured":"Wu S, Irsoy O, Lu S, Dabravolski V, Dredze M, Gehrmann S, Kambadur P, Rosenberg D, Mann G. BloombergGPT: a large language model for finance. 2023, arXiv preprint arXiv: 2303.17564"},{"key":"41126_CR38","first-page":"485","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"W Wang","year":"2020","unstructured":"Wang W, Wei F, Dong L, Bao H, Yang N, Zhou M. MINILM: deep self-attention distillation for task-agnostic compression of pretrained transformers. 
In: Proceedings of the 34th International Conference on Neural Information Processing Systems. 2020, 485"},{"key":"41126_CR39","unstructured":"Schulman J, Wolski F, Dhariwal P, Radford A, Klimov O. Proximal policy optimization algorithms. 2017, arXiv preprint arXiv: 1707.06347"},{"key":"41126_CR40","first-page":"1516","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"H Liu","year":"2023","unstructured":"Liu H, Li C, Wu Q, Lee Y J. Visual instruction tuning. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. 2023, 1516"},{"key":"41126_CR41","first-page":"57730","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"W Yu","year":"2024","unstructured":"Yu W, Yang Z, Li L, Wang J, Lin K, Liu Z, Wang X, Wang L. MM-Vet: evaluating large multimodal models for integrated capabilities. In: Proceedings of the 41st International Conference on Machine Learning. 2024, 57730\u201357754"},{"key":"41126_CR42","doi-asserted-by":"publisher","first-page":"3608","DOI":"10.1109\/CVPR.2018.00380","volume-title":"Proceedings of 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"D Gurari","year":"2018","unstructured":"Gurari D, Li Q, Stangl A J, Guo A, Lin C, Grauman K, Luo J, Bigham J P. VizWiz grand challenge: answering visual questions from blind people. In: Proceedings of 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2018, 3608\u20133617"},{"key":"41126_CR43","first-page":"6904","volume-title":"Proceedings of 2017 IEEE Conference on Computer Vision and Pattern Recognition","author":"Y Goyal","year":"2017","unstructured":"Goyal Y, Khot T, Summers-Stay D, Batra D, Parikh D. Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: Proceedings of 2017 IEEE Conference on Computer Vision and Pattern Recognition. 
2017, 6904\u20136913"},{"key":"41126_CR44","first-page":"213","volume-title":"Proceedings of the 16th European Conference on Computer Vision","author":"N Carion","year":"2020","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S. End-to-end object detection with transformers. In: Proceedings of the 16th European Conference on Computer Vision. 2020, 213\u2013229"},{"key":"41126_CR45","first-page":"1290","volume-title":"Proceedings of 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"B Cheng","year":"2022","unstructured":"Cheng B, Misra I, Schwing A G, Kirillov A, Girdhar R. Masked-attention mask transformer for universal image segmentation. In: Proceedings of 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2022, 1290\u20131299"},{"key":"41126_CR46","first-page":"38","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"S Liu","year":"2025","unstructured":"Liu S, Zeng Z, Ren T, Li F, Zhang H, Yang J, Jiang Q, Li C, Yang J, Su H, Zhu J, Zhang L. Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. In: Proceedings of the 18th European Conference on Computer Vision. 2025, 38\u201355"},{"key":"41126_CR47","first-page":"12888","volume-title":"Proceedings of the 39th International Conference on Machine Learning","author":"J Li","year":"2022","unstructured":"Li J, Li D, Xiong C, Hoi S C H. BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Proceedings of the 39th International Conference on Machine Learning. 2022, 12888\u201312900"},{"key":"41126_CR48","volume-title":"Cash recognition API document","author":"Baidu","year":"2024","unstructured":"Baidu. Cash recognition API document. See ai.baidu.com\/ai-doc\/IMAGERECOGNITION\/pk3bcxavy website, 2024"},{"key":"41126_CR49","volume-title":"PaddleOCR project","author":"PaddleOCR","year":"2024","unstructured":"PaddleOCR. 
PaddleOCR project. See github.com\/PaddlePaddle\/PaddleOCR website, 2024"},{"key":"41126_CR50","volume-title":"Vision transformer (ViT) for facial expression recognition model card","author":"Trpakov","year":"2024","unstructured":"Trpakov. Vision transformer (ViT) for facial expression recognition model card. See huggingface.co\/trpakov\/vit-face-expression website, 2024"},{"key":"41126_CR51","volume-title":"Human action recognition ViT model card","author":"Rvv-karma","year":"2024","unstructured":"Rvv-karma. Human action recognition ViT model card. See huggingface.co\/rvv-karma\/Human-Action-Recognition-VIT-Base-patch16-224 website, 2024"},{"key":"41126_CR52","volume-title":"Dish recognition API document","author":"Baidu","year":"2024","unstructured":"Baidu. Dish recognition API document. See ai.baidu.com\/ai-doc\/IMAGERECOGNITION\/tk3bcxbb0 website, 2024"},{"key":"41126_CR53","volume-title":"Fruit and vegetable recognition API document","author":"Baidu","year":"2024","unstructured":"Baidu. Fruit and vegetable recognition API document. See ai.baidu.com\/ai-doc\/IMAGERECOGNITION\/wk3bcxevq website, 2024"},{"key":"41126_CR54","doi-asserted-by":"publisher","first-page":"122109","DOI":"10.1016\/j.eswa.2023.122109","volume":"238","author":"X Zhang","year":"2024","unstructured":"Zhang X, Liang L, Zhao S, Wang Z. GRFB-UNet: a new multi-scale attention network with group receptive field block for tactile paving segmentation. Expert Systems with Applications, 2024, 238: 122109","journal-title":"Expert Systems with Applications"},{"key":"41126_CR55","first-page":"259","volume-title":"Proceedings of the 18th International Conference on Computer Analysis of Images and Patterns","author":"S Yu","year":"2019","unstructured":"Yu S, Lee H, Kim J. LYTNet: a convolutional neural network for real-time pedestrian traffic lights and zebra crossing recognition for the visually impaired. In: Proceedings of the 18th International Conference on Computer Analysis of Images and Patterns. 
2019, 259\u2013270"},{"key":"41126_CR56","first-page":"740","volume-title":"Proceedings of the 13th European Conference on Computer Vision","author":"T Y Lin","year":"2014","unstructured":"Lin T Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick C L. Microsoft COCO: common objects in context. In: Proceedings of the 13th European Conference on Computer Vision. 2014, 740\u2013755"},{"key":"41126_CR57","first-page":"633","volume-title":"Proceedings of 2017 IEEE Conference on Computer Vision and Pattern Recognition","author":"B Zhou","year":"2017","unstructured":"Zhou B, Zhao H, Puig X, Fidler S, Barriuso A, Torralba A. Scene parsing through ADE20K dataset. In: Proceedings of 2017 IEEE Conference on Computer Vision and Pattern Recognition. 2017, 633\u2013641"},{"key":"41126_CR58","first-page":"4990","volume-title":"Proceedings of 2017 IEEE International Conference on Computer Vision","author":"G Neuhold","year":"2017","unstructured":"Neuhold G, Ollmann T, Rota Bul\u00f2 S, Kontschieder P. The mapillary vistas dataset for semantic understanding of street scenes. In: Proceedings of 2017 IEEE International Conference on Computer Vision. 2017, 4990\u20134999"},{"key":"41126_CR59","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1016\/j.neunet.2014.09.005","volume":"64","author":"I J Goodfellow","year":"2015","unstructured":"Goodfellow I J, Erhan D, Luc Carrier P, Courville A, Mirza M, Hamner B, Cukierski W, Tang Y, Thaler D, Lee D H, Zhou Y, Ramaiah C, Feng F, Li R, Wang X, Athanasakis D, Shawe-Taylor J, Milakov M, Park J, Ionescu R, Popescu M, Grozea C, Bergstra J, Xie J, Romaszko L, Xu B, Chuang Z, Bengio Y. Challenges in representation learning: a report on three machine learning contests. Neural Networks, 2015, 64: 59\u201363","journal-title":"Neural Networks"},{"key":"41126_CR60","volume-title":"Human action recognition (HAR) dataset","author":"Kaggle","year":"2022","unstructured":"Kaggle. Human action recognition (HAR) dataset. 
See www.kaggle.com\/datasets\/meetnagadia\/human-action-recognition-har-dataset website, 2022"}],"container-title":["Frontiers of Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-025-41126-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11704-025-41126-5","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11704-025-41126-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T07:03:34Z","timestamp":1770879814000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11704-025-41126-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,12]]},"references-count":60,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2026,9]]}},"alternative-id":["41126"],"URL":"https:\/\/doi.org\/10.1007\/s11704-025-41126-5","relation":{},"ISSN":["2095-2228","2095-2236"],"issn-type":[{"value":"2095-2228","type":"print"},{"value":"2095-2236","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,12]]},"assertion":[{"value":"20 October 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Shiguang SHAN is an Editorial Board member of the journal and a co-author of this article. 
To minimize bias, he was excluded from all editorial decision-making related to the acceptance of this article for publication. The remaining authors declare no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"2009705"}}