{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T19:49:27Z","timestamp":1777060167097,"version":"3.51.4"},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T00:00:00Z","timestamp":1776988800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T00:00:00Z","timestamp":1776988800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1007\/s11760-026-05260-2","type":"journal-article","created":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T19:24:29Z","timestamp":1777058669000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Hybrid Mamba-attention encoder for text-to-time 3D human motion grounding"],"prefix":"10.1007","volume":"20","author":[{"given":"Haoqiang","family":"Wang","sequence":"first","affiliation":[]},{"given":"Lianyu","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Duoli","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Yuna","family":"Zhong","sequence":"additional","affiliation":[]},{"given":"Xingru","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Biqun","family":"Xiang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,4,24]]},"reference":[{"issue":"7","key":"5260_CR1","doi-asserted-by":"publisher","first-page":"3459","DOI":"10.1109\/TIP.2018.2818328","volume":"27","author":"S Song","year":"2018","unstructured":"Song, S., Lan, C., Xing, J., Zeng, W., Liu, J.: Spatio-temporal attention-based lstm networks for 3d action recognition and detection. IEEE Trans. Image Process. 27(7), 3459\u20133471 (2018)","journal-title":"IEEE Trans. Image Process."},{"issue":"9","key":"5260_CR2","doi-asserted-by":"publisher","first-page":"4382","DOI":"10.1109\/TIP.2018.2837386","volume":"27","author":"H Wang","year":"2018","unstructured":"Wang, H., Wang, L.: Beyond joints: Learning representations from primitive geometries for skeleton-based action recognition and detection. IEEE Trans. Image Process. 27(9), 4382\u20134394 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"5260_CR3","unstructured":"Sun, J., Zhou, B., Black, M.J., Chandrasekaran, A.: Locate: End-to-end localization of actions in 3d with transformers. arXiv preprint arXiv:2203.10719 (2022)"},{"issue":"1","key":"5260_CR4","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1109\/TETC.2022.3230912","volume":"12","author":"B Filtjens","year":"2022","unstructured":"Filtjens, B., Vanrumste, B., Slaets, P.: Skeleton-based action segmentation with multi-stage spatial-temporal graph convolutional neural networks. IEEE Trans. Emerg. Top. Comput. 12(1), 202\u2013212 (2022)","journal-title":"IEEE Trans. Emerg. Top. Comput."},{"key":"5260_CR5","doi-asserted-by":"publisher","first-page":"4039","DOI":"10.1109\/TMM.2023.3321438","volume":"26","author":"Y Wang","year":"2023","unstructured":"Wang, Y., Kang, H., Wu, D., Yang, W., Zhang, L.: Global and local spatio-temporal encoder for 3d human pose estimation. IEEE Trans. Multimedia 26, 4039\u20134049 (2023)","journal-title":"IEEE Trans. Multimedia"},{"key":"5260_CR6","doi-asserted-by":"crossref","unstructured":"Liu, C., Hu, Y., Li, Y., Song, S., Liu, J.: Pku-mmd: A large scale benchmark for continuous multi-modal human action understanding. arXiv preprint arXiv:1703.07475 (2017)","DOI":"10.1145\/3132734.3132739"},{"issue":"15","key":"5260_CR7","doi-asserted-by":"publisher","first-page":"4083","DOI":"10.3390\/s20154083","volume":"20","author":"F Niemann","year":"2020","unstructured":"Niemann, F., Reining, C., Moya Rueda, F., Nair, N.R., Steffens, J.A., Fink, G.A., Ten Hompel, M.: Lara: Creating a dataset for human activity recognition in logistics using semantic attributes. Sensors 20(15), 4083 (2020)","journal-title":"Sensors"},{"key":"5260_CR8","doi-asserted-by":"crossref","unstructured":"Punnakkal, A.R., Chandrasekaran, A., Athanasiou, N., Quiros-Ramirez, A., Black, M.J.: Babel: Bodies, action and behavior with english labels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 722\u2013731 (2021)","DOI":"10.1109\/CVPR46437.2021.00078"},{"key":"5260_CR9","doi-asserted-by":"crossref","unstructured":"Wang, X., Kang, Z., Mu, Y.: Text-controlled motion mamba: text-instructed temporal grounding of human motion. arXiv preprint arXiv:2404.11375 (2024)","DOI":"10.1109\/TIP.2025.3624601"},{"key":"5260_CR10","doi-asserted-by":"crossref","unstructured":"Yan, S., Liu, M., Wang, Y., Liu, Y., Liu, H.: Mlp: Motion label prior for temporal sentence localization in untrimmed 3d human motions. IEEE Transactions on Circuits and Systems for Video Technology (2024)","DOI":"10.1109\/TCSVT.2024.3421565"},{"key":"5260_CR11","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhen, L., Zhou, J.T., Goh, R.S.M.: Parallel attention network with sequence matching for video grounding. arXiv preprint arXiv:2105.08481 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.69"},{"key":"5260_CR12","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"5260_CR13","unstructured":"Gu, A., Dao, T.: Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:2312.00752 (2023)"},{"key":"5260_CR14","unstructured":"Gu, A., Goel, K., R\u00e9, C.: Efficiently modeling long sequences with structured state spaces. arXiv preprint arXiv:2111.00396 (2021)"},{"key":"5260_CR15","first-page":"1474","volume":"33","author":"A Gu","year":"2020","unstructured":"Gu, A., Dao, T., Ermon, S., Rudra, A., R\u00e9, C.: Hippo: Recurrent memory with optimal polynomial projections. Adv. Neural. Inf. Process. Syst. 33, 1474\u20131487 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5260_CR16","unstructured":"Fu, D.Y., Dao, T., Saab, K.K., Thomas, A.W., Rudra, A., R\u00e9, C.: Hungry hungry hippos: Towards language modeling with state space models. arXiv preprint arXiv:2212.14052 (2022)"},{"key":"5260_CR17","first-page":"35971","volume":"35","author":"A Gu","year":"2022","unstructured":"Gu, A., Goel, K., Gupta, A., R\u00e9, C.: On the parameterization and initialization of diagonal state space models. Adv. Neural. Inf. Process. Syst. 35, 35971\u201335983 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5260_CR18","unstructured":"Gu, A., Johnson, I., Timalsina, A., Rudra, A., R\u00e9, C.: How to train your hippo: State space models with generalized orthogonal basis projections. arXiv preprint arXiv:2206.12037 (2022)"},{"key":"5260_CR19","first-page":"22982","volume":"35","author":"A Gupta","year":"2022","unstructured":"Gupta, A., Gu, A., Berant, J.: Diagonal state spaces are as effective as structured state spaces. Adv. Neural. Inf. Process. Syst. 35, 22982\u201322994 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5260_CR20","unstructured":"Ma, J., Li, F., Wang, B.: U-mamba: Enhancing long-range dependency for biomedical image segmentation. arXiv preprint arXiv:2401.04722 (2024)"},{"key":"5260_CR21","doi-asserted-by":"crossref","unstructured":"Ruan, J., Li, J., Xiang, S.: Vm-unet: Vision mamba unet for medical image segmentation. arXiv preprint arXiv:2402.02491 (2024)","DOI":"10.1145\/3767748"},{"key":"5260_CR22","doi-asserted-by":"crossref","unstructured":"Li, K., Li, X., Wang, Y., He, Y., Wang, Y., Wang, L., Qiao, Y.: Videomamba: State space model for efficient video understanding. In: European Conference on Computer Vision, pp. 237\u2013255 (2024). Springer","DOI":"10.1007\/978-3-031-73347-5_14"},{"key":"5260_CR23","doi-asserted-by":"crossref","unstructured":"Chen, G., Huang, Y., Xu, J., Pei, B., Chen, Z., Li, Z., Wang, J., Li, K., Lu, T., Wang, L.: Video mamba suite: State space model as a versatile alternative for video understanding. arXiv preprint arXiv:2403.09626 (2024)","DOI":"10.1007\/s11263-025-02597-y"},{"key":"5260_CR24","unstructured":"Dao, T., Gu, A.: Transformers are ssms: Generalized models and efficient algorithms through structured state space duality. arXiv preprint arXiv:2405.21060 (2024)"},{"key":"5260_CR25","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: Amass: Archive of motion capture as surface shapes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5442\u20135451 (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"5260_CR26","doi-asserted-by":"crossref","unstructured":"Guo, C., Zou, S., Zuo, X., Wang, S., Ji, W., Li, X., Cheng, L.: Generating diverse and natural 3d human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5152\u20135161 (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"5260_CR27","doi-asserted-by":"crossref","unstructured":"Wang, Z., Wang, L., Wu, T., Li, T., Wu, G.: Negative sample matters: A renaissance of metric learning for temporal grounding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 2613\u20132623 (2022)","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"5260_CR28","doi-asserted-by":"crossref","unstructured":"Mun, J., Cho, M., Han, B.: Local-global video-text interactions for temporal grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10810\u201310819 (2020)","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"5260_CR29","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931 (2020)","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"5260_CR30","unstructured":"Wang, J., Sun, A., Zhang, H., Li, X.: Ms-detr: Natural language video localization with sampling moment-moment interaction. arXiv preprint arXiv:2305.18969 (2023)"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-026-05260-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-026-05260-2","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-026-05260-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T19:24:35Z","timestamp":1777058675000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-026-05260-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,24]]},"references-count":30,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,5]]}},"alternative-id":["5260"],"URL":"https:\/\/doi.org\/10.1007\/s11760-026-05260-2","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4,24]]},"assertion":[{"value":"2 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 February 2026","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 March 2026","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 April 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}],"article-number":"279"}}