{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T00:01:43Z","timestamp":1780531303287,"version":"3.54.1"},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2026,5,25]],"date-time":"2026-05-25T00:00:00Z","timestamp":1779667200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,5,25]],"date-time":"2026-05-25T00:00:00Z","timestamp":1779667200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61902016"],"award-info":[{"award-number":["61902016"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Beijing Education Science 14th Five Year Plan Project","award":["CDDB24252"],"award-info":[{"award-number":["CDDB24252"]}]},{"name":"Beijing key laboratory of super intelligent technology for urban architecture","award":["BKL-SITUA-202501"],"award-info":[{"award-number":["BKL-SITUA-202501"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s11760-026-05437-9","type":"journal-article","created":{"date-parts":[[2026,5,25]],"date-time":"2026-05-25T07:16:53Z","timestamp":1779693413000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Sign Language Recognition with Bidirectional Visual-Pose Alternating Attention and Semantic Embedding"],"prefix":"10.1007","volume":"20","author":[{"given":"Shanshan","family":"Wan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuhan","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lan","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Houchen","family":"Lv","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,5,25]]},"reference":[{"key":"5437_CR1","unstructured":"Camgoz, N.C., Koller, O., Hadfield, S., Bowden, R.: Sign language transformers: Joint end-to-end sign language recognition and translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10023\u201310033. (2020)"},{"key":"5437_CR2","doi-asserted-by":"crossref","unstructured":"Li, D., Yu, X., Xu, C., Petersson, L., Li, H.: Transferring cross-domain knowledge for video sign language recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6205\u20136214 (2020)","DOI":"10.1109\/CVPR42600.2020.00624"},{"key":"5437_CR3","doi-asserted-by":"crossref","unstructured":"Cheng, K.L., Yang, Z., Chen, Q., Tai, Y.-W.: Fully convolutional networks for continuous sign language recognition. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXIV, pp. 697\u2013714 (2020)","DOI":"10.1007\/978-3-030-58586-0_41"},{"key":"5437_CR4","doi-asserted-by":"crossref","unstructured":"Min, Y., Hao, A., Chai, X., Chen, X.: Visual alignment constraint for continuous sign language recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11542\u201311551. (2021)","DOI":"10.1109\/ICCV48922.2021.01134"},{"key":"5437_CR5","doi-asserted-by":"crossref","unstructured":"Pu, J., Zhou, W., Hu, H., Li, H.: Boosting continuous sign language recognition via cross modality augmentation. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1497\u20131505. (2020)","DOI":"10.1145\/3394171.3413931"},{"key":"5437_CR6","doi-asserted-by":"crossref","unstructured":"Albanie, S., Varol, G., Momeni, L., Afouras, T., Chung, J.S., Fox, N., Zisserman, A.: Bsl-1k: Scaling up co-articulated sign language recognition using mouthing cues. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XI, pp. 35\u201353 (2020)","DOI":"10.1007\/978-3-030-58621-8_3"},{"key":"5437_CR7","unstructured":"Joze, H.R.V., Koller, O.: Ms-asl: A large-scale data set and benchmark for understanding american sign language, (2019). arXiv:1812.01053 arXiv preprint"},{"key":"5437_CR8","doi-asserted-by":"crossref","unstructured":"Li, D., Rodriguez, O.C., Yu, X., Li, H.: Word-level deep sign language recognition from video: A new large-scale dataset and methods comparison. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1459\u20131469 (2020)","DOI":"10.1109\/WACV45572.2020.9093512"},{"key":"5437_CR9","doi-asserted-by":"crossref","unstructured":"Li, D., Rodriguez, C., Yu, X., Li, H.: Word-level deep sign language recognition from video: A new large-scale dataset and methods comparison. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1459\u20131469 (2020)","DOI":"10.1109\/WACV45572.2020.9093512"},{"key":"5437_CR10","doi-asserted-by":"crossref","unstructured":"Zhang, J., Wang, Q., Wang, Q.: A sign language recognition framework based on cross-modal complementary information fusion. IEEE Transactions on Multimedia, 8131\u20138144 (2024)","DOI":"10.1109\/TMM.2024.3377095"},{"key":"5437_CR11","doi-asserted-by":"crossref","unstructured":"Nishimura, T., Abbasi, B.: Improving sign language recognition performance using multimodal data. In: Proceedings of the IEEE International Conference on Information Reuse and Integration for Data Science, pp. 184\u2013189. (2024)","DOI":"10.1109\/IRI62200.2024.00047"},{"key":"5437_CR12","doi-asserted-by":"crossref","unstructured":"Hakim, Z.I.A., Swargo, R.M., Adnan, M.A.: Exploring attention mechanisms in integration of multi-modal information for sign language recognition and translation. In: Proceedings of the IEEE International Conference on Image Processing, pp. 2529\u20132535. (2024)","DOI":"10.1109\/ICIP51287.2024.10648021"},{"key":"5437_CR13","unstructured":"Sincan, O.M., Tur, A.O., Keles, H.Y.: Isolated sign language recognition with multi-scale features using lstm. In: 2019 27th Signal Processing and Communications Applications Conference, pp. 1\u20134 (2019)"},{"key":"5437_CR14","doi-asserted-by":"crossref","unstructured":"Hu, H., Zhou, W., Li, H.: Hand-model-aware sign language recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 1558\u20131566. (2021)","DOI":"10.1609\/aaai.v35i2.16247"},{"key":"5437_CR15","unstructured":"Zhou, H., Zhou, W., Zhou, Y., Li, H.: Spatial-temporal multi-cue network for sign language recognition and translation. IEEE Transactions on Multimedia PP(99), 1\u20131 (2021)"},{"key":"5437_CR16","doi-asserted-by":"crossref","unstructured":"Zhou, H., Zhou, W., Qi, W., Pu, J., Li, H.: Improving sign language translation with monolingual data by sign back-translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1316\u20131325 (2021)","DOI":"10.1109\/CVPR46437.2021.00137"},{"key":"5437_CR17","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308. (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"5437_CR18","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Mei, T.: Learning spatio-temporal representation with pseudo-3d residual networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5533\u20135541. (2017)","DOI":"10.1109\/ICCV.2017.590"},{"key":"5437_CR19","doi-asserted-by":"crossref","unstructured":"Zuo, R., Wei, F., Mak, B.: Natural language-assisted sign language recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14890\u201314900. (2023)","DOI":"10.1109\/CVPR52729.2023.01430"},{"key":"5437_CR20","doi-asserted-by":"crossref","unstructured":"Jin, S., Xu, L., Xu, J., Wang, C., Liu, W., Qian, C., Ouyang, W., Luo, P.: Whole-body human pose estimation in the wild. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IX, pp. 196\u2013214 (2020)","DOI":"10.1007\/978-3-030-58545-7_12"},{"key":"5437_CR21","doi-asserted-by":"crossref","unstructured":"Boh\u00e1\u010dek, M., Hr\u00faz, M.: Sign pose-based transformer for word-level sign language recognition. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 182\u2013191. (2022)","DOI":"10.1109\/WACVW54805.2022.00024"},{"key":"5437_CR22","doi-asserted-by":"crossref","unstructured":"Hu, H., Zhao, W., Zhou, W., Wang, Y., Li, H.: Signbert: Pre-training of hand-model-aware representation for sign language recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11087\u201311096. (2021)","DOI":"10.1109\/ICCV48922.2021.01090"},{"key":"5437_CR23","doi-asserted-by":"crossref","unstructured":"Li, P., Gu, J., Kuen, J., Morariu, V.I., Zhao, H., Jain, R., Manjunatha, V., Liu, H.: Selfdoc: Self-supervised document representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5652\u20135660 (2021)","DOI":"10.1109\/CVPR46437.2021.00560"},{"key":"5437_CR24","doi-asserted-by":"crossref","unstructured":"Hu, H., Zhou, W., Pu, J., Li, H.: Global-local enhancement network for nmf-aware sign language recognition. ACM Trans. Multimed. Comput. Commun. Appl. 17(3), 1\u201319 (2021)","DOI":"10.1145\/3436754"},{"key":"5437_CR25","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32, (2018)","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"5437_CR26","doi-asserted-by":"crossref","unstructured":"Zhao, W., Hu, H., Zhou, W., Shi, J., Li, H.: Best: Bert pre-training for sign language recognition with coupling tokenization, (2023). arXiv preprint arXiv:2302.05075","DOI":"10.1609\/aaai.v37i3.25470"},{"key":"5437_CR27","doi-asserted-by":"crossref","unstructured":"Hu, H., Zhao, W., Zhou, W., Li, H.: Signbert+: Hand-model-aware self-supervised pre-training for sign language understanding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)","DOI":"10.1109\/TPAMI.2023.3269220"},{"key":"5437_CR28","doi-asserted-by":"crossref","unstructured":"Tunga, A., Nuthalapati, S.V., Wachs, J.: Pose-based sign language recognition using gcn and bert. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 31\u201340. (2021)","DOI":"10.1109\/WACVW52041.2021.00008"},{"key":"5437_CR29","doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., Han, S.: Tsm: Temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7083\u20137093. (2019)","DOI":"10.1109\/ICCV.2019.00718"},{"key":"5437_CR30","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6202\u20136211. (2019)","DOI":"10.1109\/ICCV.2019.00630"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-026-05437-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-026-05437-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-026-05437-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T23:12:26Z","timestamp":1780528346000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-026-05437-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,25]]},"references-count":30,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["5437"],"URL":"https:\/\/doi.org\/10.1007\/s11760-026-05437-9","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5,25]]},"assertion":[{"value":"18 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 May 2026","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 May 2026","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 May 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"The authors declare no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"368"}}