{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:35:14Z","timestamp":1780356914026,"version":"3.54.1"},"reference-count":60,"publisher":"Informa UK Limited","issue":"1","license":[{"start":{"date-parts":[[2024,3,27]],"date-time":"2024-03-27T00:00:00Z","timestamp":1711497600000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"content-domain":{"domain":["www.tandfonline.com"],"crossmark-restriction":true},"short-container-title":["Connection Science"],"published-print":{"date-parts":[[2024,12,31]]},"DOI":"10.1080\/09540091.2024.2325474","type":"journal-article","created":{"date-parts":[[2024,3,27]],"date-time":"2024-03-27T17:29:45Z","timestamp":1711560585000},"update-policy":"https:\/\/doi.org\/10.1080\/tandf_crossmark_01","source":"Crossref","is-referenced-by-count":2,"title":["Cross-modal learning with multi-modal model for video action recognition based on adaptive weight training"],"prefix":"10.1080","volume":"36","author":[{"given":"Qingguo","family":"Zhou","sequence":"first","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, People's Republic of China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yufeng","family":"Hou","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, People's Republic of China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rui","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, People's Republic of China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yan","family":"Li","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, People's Republic of China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"JinQiang","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, People's Republic of China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhen","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, Lanzhou University, Lanzhou, People's Republic of China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hung-Wei","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Information Engineering, Providence University, Taichung City, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tien-Hsiung","family":"Weng","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Information Engineering, Providence University, Taichung City, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"301","published-online":{"date-parts":[[2024,3,27]]},"reference":[{"key":"e_1_3_3_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_3_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02080"},{"key":"e_1_3_3_4_1","first-page":"32897","article-title":"Vlmo: Unified vision-language pre-training with mixture-of-modality-experts","volume":"35","author":"Bao H.","year":"2022","unstructured":"Bao, H., Wang, W., Dong, L., Liu, Q., Mohammed, O. K., Aggarwal, K., Som, S., Piao, S., & Wei, F. (2022). Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. Advances in Neural Information Processing Systems, 35, 32897\u201332912.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_3_5_1","volume-title":"In ICML (Vol. 2, pp. 4)","author":"Bertasius G.","year":"2021","unstructured":"Bertasius, G., Wang, H., & Torresani, L. (2021). Is space-time attention all you need for video understanding?. In ICML (Vol. 2, pp. 4). ICML."},{"issue":"3","key":"e_1_3_3_6_1","first-page":"3522","article-title":"Mmnet: A model-based multimodal network for human action recognition in rgb-d videos","volume":"45","author":"Bruce X.","year":"2022","unstructured":"Bruce, X., Liu, Y., Zhang, X., Zhong, S.-h., & Chan, K. C. (2022). Mmnet: A model-based multimodal network for human action recognition in rgb-d videos. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45(3), 3522\u20133538.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_3_7_1","unstructured":"Carreira J. Noland E. Banki-Horvath A. Hillier C. & Zisserman A. (2018). A short note about kinetics-600. abs\/1808.01340."},{"key":"e_1_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"e_1_3_3_9_1","unstructured":"Dosovitskiy A. Beyer L. Kolesnikov A. Weissenborn D. Zhai X. Unterthiner T. Dehghani M. Minderer M. Heigold G. Gelly S. Uszkoreit J. & Houlsby N. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. volume abs\/2010.11929."},{"key":"e_1_3_3_10_1","doi-asserted-by":"crossref","unstructured":"Duan S. Xia C. Gao X. Ge B. Zhang H. & Li K.-C. (2022). Multi-modality diversity fusion network with swintransformer for rgb-d salient object detection. In 2022 IEEE international conference on image processing (ICIP) (pp. 1076\u20131080). IEEE.","DOI":"10.1109\/ICIP46576.2022.9897410"},{"key":"e_1_3_3_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"e_1_3_3_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"e_1_3_3_14_1","first-page":"1","article-title":"Clip-adapter: Better vision-language models with feature adapters","author":"Gao P.","year":"2023","unstructured":"Gao, P., Geng, S., Zhang, R., Ma, T., Fang, R., Zhang, Y., Li, H., & Qiao, Y. (2023). Clip-adapter: Better vision-language models with feature adapters. International journal of computer vision, 1\u201315.\u00a0Springer.","journal-title":"International journal of computer vision"},{"key":"e_1_3_3_15_1","doi-asserted-by":"crossref","unstructured":"Hajati F. & Tavakolian M. (2020). Video classification using deep autoencoder network. In Complex intelligent and software intensive systems: Proceedings of the 13th international conference on complex intelligent and software intensive systems (CISIS-2019) (pp. 508\u2013518). Springer.","DOI":"10.1007\/978-3-030-22354-0_45"},{"key":"e_1_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"e_1_3_3_17_1","doi-asserted-by":"crossref","unstructured":"Hataya R. Zdenek J. Yoshizoe K. & Nakayama H. (2020). Faster autoaugment: Learning augmentation strategies using backpropagation. In Computer Vision\u2013ECCV 2020: 16th European Conference Glasgow UK August 23\u201328 2020 Proceedings Part XXV 16 (pp. 1\u201316). Springer.","DOI":"10.1007\/978-3-030-58595-2_1"},{"key":"e_1_3_3_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_19_1","volume-title":"In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops (pp. 958\u2013959)","author":"Iashin V.","year":"2020","unstructured":"Iashin, V., & Rahtu, E. (2020). Multi-modal dense video captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition workshops (pp. 958\u2013959).\u00a0IEEE."},{"key":"e_1_3_3_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"e_1_3_3_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"e_1_3_3_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401075"},{"key":"e_1_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00942"},{"key":"e_1_3_3_24_1","unstructured":"Kim W. Son B. & Kim I. (2021). Vilt: Vision-and-language transformer without convolution or region supervision. In International conference on machine learning (pp. 5583\u20135594). PMLR."},{"key":"e_1_3_3_25_1","doi-asserted-by":"crossref","unstructured":"Klaser A. Marsza\u0142ek M. & Schmid C. (2008). A spatio-temporal descriptor based on 3D-gradients. In BMVC 2008-19th british machine vision conference (pp. 275\u20131). British Machine Vision Association.","DOI":"10.5244\/C.22.99"},{"key":"e_1_3_3_26_1","doi-asserted-by":"crossref","unstructured":"Kuehne H. Jhuang H. Garrote E. Poggio T. & Serre T. (2011). Hmdb: A large video database for human motion recognition. In 2011 International conference on computer vision (pp. 2556\u20132563). IEEE.","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007467927290"},{"key":"e_1_3_3_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_3_29_1","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li J.","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., & Hoi, S. C. H (2021). Align before fuse: Vision and language representation learning with momentum distillation. Advances in Neural Information Processing Systems, 34, 9694\u20139705.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_3_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00117"},{"key":"e_1_3_3_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"e_1_3_3_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"e_1_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01852"},{"key":"e_1_3_3_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"e_1_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_3_36_1","doi-asserted-by":"publisher","DOI":"10.26599\/TST.2021.9010068"},{"key":"e_1_3_3_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cortex.2017.07.006"},{"key":"e_1_3_3_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"e_1_3_3_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01430"},{"key":"e_1_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1080\/09540091.2022.2115010"},{"key":"e_1_3_3_41_1","unstructured":"Radford A. Kim J. W. Hallacy C. Ramesh A. Goh G. Agarwal S. Sastry G. Askell A. Mishkin P. Clark J. & Krueger G. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning (pp. 8748\u20138763). PMLR."},{"issue":"4","key":"e_1_3_3_42_1","first-page":"2249","article-title":"Sports video classification with deep convolution neural network: A test on ucf101 dataset","volume":"8","author":"Ramesh M.","year":"2019","unstructured":"Ramesh, M., & Mahesh, K. (2019). Sports video classification with deep convolution neural network: A test on ucf101 dataset. International Journal of Engineering and Advanced Technology, 8(4S2), 2249\u20138958.","journal-title":"International Journal of Engineering and Advanced Technology"},{"key":"e_1_3_3_43_1","volume-title":"33rd Conference on Neural Information Processing Systems (NeurIPS)(Vol 32)","author":"Ravuri S.","year":"2019","unstructured":"Ravuri, S., & Vinyals, O. (2019). Classification accuracy score for conditional generative models. 33rd Conference on Neural Information Processing Systems (NeurIPS)(Vol 32).\u00a0NEURAL INFORMATION PROCESSING SYSTEMS (NIPS)."},{"key":"e_1_3_3_44_1","volume-title":"28th Conference on Neural Information Processing Systems (NIPS)(Vol 27)","author":"Simonyan K.","year":"2014","unstructured":"Simonyan, K., & Zisserman, A. (2014). Two-stream convolutional networks for action recognition in videos. 28th Conference on Neural Information Processing Systems (NIPS)(Vol 27).\u00a0NEURAL INFORMATION PROCESSING SYSTEMS (NIPS)."},{"key":"e_1_3_3_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.83"},{"key":"e_1_3_3_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_16"},{"key":"e_1_3_3_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_3_48_1","first-page":"30","article-title":"Attention is all you need","author":"Vaswani A.","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I (2017). Attention is all you need. Advances in Neural Information Processing Systems. 30.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_3_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-012-0594-8"},{"key":"e_1_3_3_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00043"},{"key":"e_1_3_3_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00413"},{"key":"e_1_3_3_52_1","doi-asserted-by":"crossref","unstructured":"Wang L. Xiong Y. Wang Z. Qiao Y. Lin D. Tang X. & Van Gool L. (2016). Temporal segment networks: Towards good practices for deep action recognition. In European conference on computer vision (pp. 20\u201336). Springer.","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_3_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3341807"},{"key":"e_1_3_3_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"e_1_3_3_55_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2022.103775"},{"key":"e_1_3_3_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-020-01691-7"},{"key":"e_1_3_3_57_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.121441"},{"key":"e_1_3_3_58_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25412"},{"key":"e_1_3_3_59_1","doi-asserted-by":"crossref","unstructured":"Zhang R. Zhang W. Fang R. Gao P. Li K. Dai J. Qiao Y. & Li H. (2022). Tip-adapter: Training-free adaption of clip for few-shot classification. In European conference on computer vision (pp. 493\u2013510). Springer.","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"e_1_3_3_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_3_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"container-title":["Connection Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.tandfonline.com\/doi\/pdf\/10.1080\/09540091.2024.2325474","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,17]],"date-time":"2024-12-17T15:02:42Z","timestamp":1734447762000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.tandfonline.com\/doi\/full\/10.1080\/09540091.2024.2325474"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,27]]},"references-count":60,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,12,31]]}},"alternative-id":["10.1080\/09540091.2024.2325474"],"URL":"https:\/\/doi.org\/10.1080\/09540091.2024.2325474","relation":{},"ISSN":["0954-0091","1360-0494"],"issn-type":[{"value":"0954-0091","type":"print"},{"value":"1360-0494","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,3,27]]},"assertion":[{"value":"The publishing and review policy for this title is described in its Aims & Scope.","order":1,"name":"peerreview_statement","label":"Peer Review Statement"},{"value":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=ccos20","URL":"http:\/\/www.tandfonline.com\/action\/journalInformation?show=aimsScope&journalCode=ccos20","order":2,"name":"aims_and_scope_url","label":"Aim & Scope"},{"value":"2023-12-25","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-02-26","order":2,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-03-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}],"article-number":"2325474"}}