{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T19:35:47Z","timestamp":1775244947886,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T00:00:00Z","timestamp":1738886400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T00:00:00Z","timestamp":1738886400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62106089"],"award-info":[{"award-number":["62106089"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62020106012"],"award-info":[{"award-number":["62020106012"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62332008"],"award-info":[{"award-number":["62332008"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62336004"],"award-info":[{"award-number":["62336004"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s11263-025-02355-0","type":"journal-article","created":{"date-parts":[[2025,2,7]],"date-time":"2025-02-07T14:35:45Z","timestamp":1738938945000},"page":"3858-3876","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Learning Structure-Supporting Dependencies via Keypoint Interactive Transformer for General Mammal Pose Estimation"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9015-3128","authenticated-orcid":false,"given":"Tianyang","family":"Xu","sequence":"first","affiliation":[]},{"given":"Jiyong","family":"Rao","sequence":"additional","affiliation":[]},{"given":"Xiaoning","family":"Song","sequence":"additional","affiliation":[]},{"given":"Zhenhua","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Xiao-Jun","family":"Wu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,7]]},"reference":[{"key":"2355_CR1","doi-asserted-by":"crossref","unstructured":"Cao, J., Tang, H., Fang, H.S., Shen, X., Lu, C., & Tai, Y.W. (2019) Cross-domain adaptation for animal pose estimation. In IEEE International Conference on Computer Vision, (pp. 9498\u20139507)","DOI":"10.1109\/ICCV.2019.00959"},{"key":"2355_CR2","unstructured":"Contributors, M. (2020), Openmmlab pose estimation toolbox and benchmark. https:\/\/github.com\/open-mmlab\/mmpose"},{"issue":"7550","key":"2355_CR3","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1038\/521007a","volume":"521","author":"K Davies","year":"2015","unstructured":"Davies, K. (2015). Keep the directive that protects research animals. Nature, 521(7550), 7\u20137.","journal-title":"Nature"},{"key":"2355_CR4","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., & Fei-Fei, L. (2009) Imagenet: A large-scale hierarchical image database. In IEEE Computer Vision and Pattern Recognition, (pp. 248\u2013255)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2355_CR5","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., & Gelly, S. et\u00a0al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"2355_CR6","doi-asserted-by":"crossref","unstructured":"Hirschorn, O., & Avidan, S. (2023). Pose anything: A graph-based approach for category-agnostic pose estimation. arXiv preprint arXiv:2311.17891","DOI":"10.1007\/978-3-031-73036-8_27"},{"key":"2355_CR7","doi-asserted-by":"crossref","unstructured":"Huang, J., Zhu, Z., Guo, F., & Huang, G. (2020) The devil is in the details: Delving into unbiased data processing for human pose estimation. In IEEE Computer Vision and Pattern Recognition, (pp 5700\u20135709)","DOI":"10.1109\/CVPR42600.2020.00574"},{"key":"2355_CR8","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2022.103483","volume":"222","author":"L Jiang","year":"2022","unstructured":"Jiang, L., Lee, C., Teotia, D., & Ostadabbas, S. (2022). Animal pose estimation: A closer look at the state-of-the-art, existing gaps and opportunities. Computer Vision and Image Understanding, 222, 103483.","journal-title":"Computer Vision and Image Understanding"},{"issue":"10","key":"2355_CR9","doi-asserted-by":"publisher","first-page":"144","DOI":"10.1109\/97.329844","volume":"1","author":"I Katsavounidis","year":"1994","unstructured":"Katsavounidis, I., Kuo, C. C. J., & Zhang, Z. (1994). A new initialization technique for generalized Lloyd iteration. IEEE Signal Processing Letters, 1(10), 144\u2013146.","journal-title":"IEEE Signal Processing Letters"},{"key":"2355_CR10","doi-asserted-by":"crossref","unstructured":"Ke, L., Chang, M. C., Qi, H., & Lyu, S. (2018). Multi-scale structure-aware network for human pose estimation., In European Conference on Computer Vision, (pp. 713\u2013728)","DOI":"10.1007\/978-3-030-01216-8_44"},{"key":"2355_CR11","unstructured":"Kingma, D.P., & Ba, J. (2014) Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"issue":"4","key":"2355_CR12","doi-asserted-by":"publisher","first-page":"496","DOI":"10.1038\/s41592-022-01443-0","volume":"19","author":"J Lauer","year":"2022","unstructured":"Lauer, J., Zhou, M., Ye, S., Menegas, W., Schneider, S., Nath, T., Rahman, M. M., Di Santo, V., Soberanes, D., Feng, G., et al. (2022). Multi-animal pose estimation, identification and tracking with deeplabcut. Nature Methods, 19(4), 496\u2013504.","journal-title":"Nature Methods"},{"key":"2355_CR13","doi-asserted-by":"crossref","unstructured":"Li, C., & Lee, G. H. (2021). From synthetic to real: Unsupervised domain adaptation for animal pose estimation., In IEEE Computer Vision and Pattern Recognition, (pp. 1482\u20131491).","DOI":"10.1109\/CVPR46437.2021.00153"},{"key":"2355_CR14","doi-asserted-by":"crossref","unstructured":"Li, C., & Lee, G.H. (2023) Scarcenet: Animal pose estimation with scarce annotations. In IEEE Computer Vision and Pattern Recognition, (pp. 17174\u201317183)","DOI":"10.1109\/CVPR52729.2023.01647"},{"key":"2355_CR15","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, S., Zhang, X., Xu, Y., Xu, W., & Tu, Z. (2021a). Pose recognition with cascade transformers., In IEEE Computer Vision and Pattern Recognition, (pp. 1944\u20131953)","DOI":"10.1109\/CVPR46437.2021.00198"},{"key":"2355_CR16","first-page":"21002","volume":"33","author":"X Li","year":"2020","unstructured":"Li, X., Wang, W., Wu, L., Chen, S., Hu, X., Li, J., Tang, J., & Yang, J. (2020). Generalized focal loss: Learning qualified and distributed bounding boxes for dense object detection. Advances in Neural Information Processing Systems, 33, 21002\u201321012.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2355_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., Zhang, S., Wang, Z., Yang, S., Yang, W., Xia, S. T., & Zhou, E. (2021b). Tokenpose: Learning keypoint tokens for human pose estimation, In IEEE International Conference on Computer Vision, (pp. 11313\u201311322)","DOI":"10.1109\/ICCV48922.2021.01112"},{"key":"2355_CR18","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco,: Common objects in context. In European Conference on Computer Vision, (pp. 740\u2013755).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2355_CR19","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Goyal, P., Girshick, R., He, K., & Doll\u00e1r, P. (2017). Focal loss for dense object detection., In: IEEE International Conference on Computer Vision, (pp. 2980\u20132988).","DOI":"10.1109\/ICCV.2017.324"},{"key":"2355_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows., In IEEE International Conference on Computer Vision, (pp. 10012\u201310022)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2355_CR21","doi-asserted-by":"crossref","unstructured":"Mao, X., Qi, G., Chen, Y., Li, X., Duan, R., Ye, S., He, Y., & Xue, H. (2022). Towards robust vision transformer., In IEEE Computer Vision and Pattern Recognition, (pp. 12042\u201312051)","DOI":"10.1109\/CVPR52688.2022.01173"},{"issue":"9","key":"2355_CR22","doi-asserted-by":"publisher","first-page":"1281","DOI":"10.1038\/s41593-018-0209-y","volume":"21","author":"A Mathis","year":"2018","unstructured":"Mathis, A., Mamidanna, P., Cury, K. M., Abe, T., Murthy, V. N., Mathis, M. W., & Bethge, M. (2018). Deeplabcut: Markerless pose estimation of user-defined body parts with deep learning. Nature Neuroscience, 21(9), 1281\u20131289.","journal-title":"Nature Neuroscience"},{"key":"2355_CR23","doi-asserted-by":"crossref","unstructured":"Mathis, A., Biasi, T., Schneider, S., Yuksekgonul, M., Rogers, B., Bethge, M., & Mathis, M. W. (2021). Pretraining boosts out-of-domain robustness for pose estimation., In Winter Conference on Applications of Computer Vision, (pp 1859\u20131868)","DOI":"10.1109\/WACV48630.2021.00190"},{"key":"2355_CR24","doi-asserted-by":"crossref","unstructured":"Mu, J., Qiu, W., Hager, G. D., & Yuille, A. L. (2020). Learning from synthetic animals., In IEEE Computer Vision and Pattern Recognition, (pp. 12386\u201312395).","DOI":"10.1109\/CVPR42600.2020.01240"},{"key":"2355_CR25","first-page":"23296","volume":"34","author":"MM Naseer","year":"2021","unstructured":"Naseer, M. M., Ranasinghe, K., Khan, S. H., Hayat, M., Shahbaz Khan, F., & Yang, M. H. (2021). Intriguing properties of vision transformers. Advances in Neural Information Processing Systems, 34, 23296\u201323308.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2355_CR26","doi-asserted-by":"crossref","unstructured":"Ng, X. L., Ong, K. E., Zheng, Q., Ni, Y., Yeo, S. Y., & Liu, J. (2022). Animal kingdom: A large and diverse dataset for animal behavior understanding., In IEEE Computer Vision and Pattern Recognition, (pp 19023\u201319034).","DOI":"10.1109\/CVPR52688.2022.01844"},{"key":"2355_CR27","unstructured":"Park, N., & Kim, S. (2022). How do vision transformers work? In International Conference on Learning Representations"},{"issue":"1","key":"2355_CR28","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1038\/s41592-018-0234-5","volume":"16","author":"TD Pereira","year":"2019","unstructured":"Pereira, T. D., Aldarondo, D. E., Willmore, L., Kislin, M., Wang, S. S. H., Murthy, M., & Shaevitz, J. W. (2019). Fast animal pose estimation using deep neural networks. Nature Methods, 16(1), 117\u2013125.","journal-title":"Nature Methods"},{"issue":"4","key":"2355_CR29","doi-asserted-by":"publisher","first-page":"486","DOI":"10.1038\/s41592-022-01426-1","volume":"19","author":"TD Pereira","year":"2022","unstructured":"Pereira, T. D., Tabris, N., Matsliah, A., Turner, D. M., Li, J., Ravindranath, S., Papadoyannis, E. S., Normand, E., Deutsch, D. S., Wang, Z. Y., et al. (2022). Sleap: A deep learning system for multi-animal pose tracking. Nature Methods, 19(4), 486\u2013495.","journal-title":"Nature Methods"},{"key":"2355_CR30","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al. (2021). Learning transferable visual models from natural language supervision., In International Conference on Machine Learning, (pp. 8748\u20138763)."},{"key":"2355_CR31","doi-asserted-by":"crossref","unstructured":"Rao, J., Xu, T., Song, X., Feng, Z. H., & Wu, X. J. (2022). Kitpose: Keypoint-interactive transformer for animal pose estimation., In Chinese Conference on Pattern Recognition and Computer Vision, Springer, (pp. 660\u2013673).","DOI":"10.1007\/978-3-031-18907-4_51"},{"key":"2355_CR32","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., & Wang, J. (2019). Deep high-resolution representation learning for human pose estimation., In IEEE Computer Vision and Pattern Recognition, (pp. 5693\u20135703).","DOI":"10.1109\/CVPR.2019.00584"},{"key":"2355_CR33","doi-asserted-by":"crossref","unstructured":"Sun, M., Zhao, Z., Chai, W., Luo, H., Cao, S., Zhang, Y., Hwang, J. N., & Wang, G. (2024). UNIAP: Towards universal animal perception in vision via few-shot learning. In Proceedings of the AAAI Conference on Artificial Intelligence, (vol. 38, pp. 5008\u20135016).","DOI":"10.1609\/aaai.v38i5.28305"},{"issue":"3","key":"2355_CR34","doi-asserted-by":"publisher","first-page":"633","DOI":"10.1007\/s11263-023-01901-y","volume":"132","author":"J Tu","year":"2023","unstructured":"Tu, J., Wu, G., & Wang, L. (2023). Dual graph networks for pose estimation in crowded scenes. International Journal of Computer Vision, 132(3), 633\u2013653.","journal-title":"International Journal of Computer Vision"},{"key":"2355_CR35","doi-asserted-by":"crossref","unstructured":"Xiao, B., Wu, H., & Wei, Y. (2018). Simple baselines for human pose estimation and tracking., In European Conference on Computer Vision, (pp. 466\u2013481).","DOI":"10.1007\/978-3-030-01231-1_29"},{"key":"2355_CR36","doi-asserted-by":"crossref","unstructured":"Xu, L., Jin, S., Zeng, W., Liu, W., Qian, C., Ouyang, W., Luo, P., & Wang, X. (2022a). Pose for everything: Towards category-agnostic pose estimation. In European Conference on Computer Vision, (pp. 398\u2013416).","DOI":"10.1007\/978-3-031-20068-7_23"},{"key":"2355_CR37","first-page":"38571","volume":"35","author":"Y Xu","year":"2022","unstructured":"Xu, Y., Zhang, J., Zhang, Q., & Tao, D. (2022). ViTPose: Simple vision transformer baselines for human pose estimation. Advances in Neural Information Processing Systems, 35, 38571\u201338584.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2355_CR38","unstructured":"Xu, Y., Zhang, J., Zhang, Q., & Tao, D. (2022). Vitpose+: Vision transformer foundation model for generic body pose estimation., arXiv preprint arXiv:2212.04246"},{"key":"2355_CR39","doi-asserted-by":"crossref","unstructured":"Yang, S., Quan, Z., Nie, M., & Yang, W. (2021). Transpose: Keypoint localization via transformer., In IEEE International Conference on Computer Vision, (pp. 11802\u201311812)","DOI":"10.1109\/ICCV48922.2021.01159"},{"key":"2355_CR40","unstructured":"Yu, H,. Xu, Y., Zhang, J., Zhao, W., Guan, Z., Tao, D. (2021). Ap-10k: A benchmark for animal pose estimation in the wild. In: Advances in Neural Information Processing Systems"},{"key":"2355_CR41","first-page":"7281","volume":"34","author":"Y Yuan","year":"2021","unstructured":"Yuan, Y., Fu, R., Huang, L., Lin, W., Zhang, C., Chen, X., & Wang, J. (2021). Hrformer: High-resolution vision transformer for dense predict. Advances in Neural Information Processing Systems, 34, 7281\u20137293.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2355_CR42","doi-asserted-by":"crossref","unstructured":"Yun, S., Han, D., Oh, S. J., Chun, S., Choe, J., & Yoo, Y. (2019). Cutmix: Regularization strategy to train strong classifiers with localizable features., In IEEE International Conference on Computer Vision, (pp 6023\u20136032)","DOI":"10.1109\/ICCV.2019.00612"},{"key":"2355_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, F., Zhu, X., Dai, H., Ye, M., & Zhu, C. (2020). Distribution-aware coordinate representation for human pose estimation., In IEEE Computer Vision and Pattern Recognition, (pp. 7093\u20137102).","DOI":"10.1109\/CVPR42600.2020.00712"},{"key":"2355_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, J., Chen, Z., & Tao, D. (2021). Towards high performance human keypoint detection. International Journal of Computer Vision, 129(9), 2639\u20132662.","DOI":"10.1007\/s11263-021-01482-8"},{"issue":"2","key":"2355_CR45","doi-asserted-by":"publisher","first-page":"496","DOI":"10.1007\/s11263-022-01711-8","volume":"131","author":"L Zhang","year":"2023","unstructured":"Zhang, L., Gao, J., Xiao, Z., & Fan, H. (2023). Animaltrack: A benchmark for multi-animal tracking in the wild. International Journal of Computer Vision, 131(2), 496\u2013513.","journal-title":"International Journal of Computer Vision"},{"key":"2355_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, X., Wang, W., Chen, Z., Xu, Y., Zhang, J., & Tao, D. (2023b). Clamp: Prompt-based contrastive learning for connecting language and animal pose., In IEEE Computer Vision and Pattern Recognition, (pp. 23272\u201323281)","DOI":"10.1109\/CVPR52729.2023.02229"},{"key":"2355_CR47","doi-asserted-by":"crossref","unstructured":"Zhao, S., Zhu, L., Wang, X., & Yang, Y. (2022). Centerclip: Token clustering for efficient text-video retrieval., In International ACM SIGIR Conference on Research and Development in Information Retrieval, (pp. 970\u2013981)","DOI":"10.1145\/3477495.3531950"},{"key":"2355_CR48","doi-asserted-by":"crossref","unstructured":"Zuffi, S., Kanazawa, A., Berger-Wolf, T., & Black, M.J. (2019) Three-d safari: Learning to estimate zebra pose, shape, and texture from images\u201d in the wild\u201d. In IEEE Computer Vision and Pattern Recognition, (pp. 5359\u20135368)","DOI":"10.1109\/ICCV.2019.00546"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02355-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02355-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02355-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T05:59:08Z","timestamp":1749275948000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02355-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,7]]},"references-count":48,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["2355"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02355-0","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,7]]},"assertion":[{"value":"3 November 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}