{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T06:06:26Z","timestamp":1757311586917},"reference-count":35,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"6","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2023,6,1]]},"DOI":"10.1587\/transinf.2022edp7182","type":"journal-article","created":{"date-parts":[[2023,5,31]],"date-time":"2023-05-31T22:20:12Z","timestamp":1685571612000},"page":"1165-1174","source":"Crossref","is-referenced-by-count":2,"title":["FSPose: A Heterogeneous Framework with Fast and Slow Networks for Human Pose Estimation in Videos"],"prefix":"10.1587","volume":"E106.D","author":[{"given":"Jianfeng","family":"XU","sequence":"first","affiliation":[{"name":"KDDI Research, Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satoshi","family":"KOMORITA","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kei","family":"KAWAMURA","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"publisher","unstructured":"[1] Q. Dang, J. Yin, B. Wang, and W. Zheng, \u201cDeep Learning based 2d Human Pose Estimation: A Survey,\u201d Tsinghua Science and Technology, vol.24, no.6, pp.663-676, 2019. 10.26599\/tst.2018.9010100","DOI":"10.26599\/TST.2018.9010100"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] D. Yang, R. Dai, Y. Wang, R. Mallick, L. Minciullo, G. Francesca, and F. Bremond, \u201cSelective Spatio-temporal Aggregation Based Pose Refinement System: Towards Understanding Human Activities in Real-world Videos,\u201d Proc. IEEE\/CVF Winter Conference on Applications of Computer Vision, pp.2363-2372, 2021. 10.1109\/wacv48630.2021.00241","DOI":"10.1109\/WACV48630.2021.00241"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] K. Sun, B. Xiao, D. Liu, and J. Wang, \u201cDeep High-resolution Representation Learning for Human Pose Estimation,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.5693-5703, 2019. 10.1109\/cvpr.2019.00584","DOI":"10.1109\/CVPR.2019.00584"},{"key":"4","unstructured":"[4] G. Bertasius, C. Feichtenhofer, D. Tran, J. Shi, and L. Torresani, \u201cLearning Temporal Pose Estimation from Sparsely-labeled Videos,\u201d Advances in Neural Information Processing Systems 32, pp.3027-3038, 2019."},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] A. Howard, M. Sandler, B. Chen, W. Wang, L.-C. Chen, M. Tan, G. Chu, V. Vasudevan, Y. Zhu, R. Pang, H. Adam, and Q. Le, \u201cSearching for MobileNetV3,\u201d Proc. IEEE\/CVF International Conference on Computer Vision, pp.1314-1324, 2019. 10.1109\/iccv.2019.00140","DOI":"10.1109\/ICCV.2019.00140"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] M. Sandler, A. Howard, M. Zhu, A. Zhmoginov, and L.-C. Chen, \u201cMobileNetV2: Inverted Residuals and Linear Bottlenecks,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.4510-4520, 2018. 10.1109\/cvpr.2018.00474","DOI":"10.1109\/CVPR.2018.00474"},{"key":"7","unstructured":"[7] M. Tan and Q. Le, \u201cEfficientnet: Rethinking Model Scaling for Convolutional Neural Networks,\u201d International Conference on Machine Learning, pp.6105-6114, PMLR, 2019."},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] X. Zhang, X. Zhou, M. Lin, and J. Sun, \u201cShufflenet: An Extremely Efficient Convolutional Neural Network for Mobile Devices,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.6848-6856, 2018. 10.1109\/cvpr.2018.00716","DOI":"10.1109\/CVPR.2018.00716"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] J. Wang, K. Qiu, H. Peng, J. Fu, and J. Zhu, \u201cAI Coach: Deep Human Pose Estimation and Analysis for Personalized Athletic Training Assistance,\u201d Proc. 27th ACM International Conference on Multimedia, pp.2228-2230, 2019. 10.1145\/3343031.3350609","DOI":"10.1145\/3343031.3350609"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] U. Iqbal, A. Milan, and J. Gall, \u201cPoseTrack: Joint Multi-person Pose Estimation and Tracking,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.4654-4663, 2017. 10.1109\/cvpr.2017.495","DOI":"10.1109\/CVPR.2017.495"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] M. Andriluka, U. Iqbal, E. Insafutdinov, L. Pishchulin, A. Milan, J. Gall, and B. Schiele, \u201cPoseTrack: A Benchmark for Human Pose Estimation and Tracking,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.5167-5176, 2018. 10.1109\/cvpr.2018.00542","DOI":"10.1109\/CVPR.2018.00542"},{"key":"12","doi-asserted-by":"publisher","unstructured":"[12] Y. Kawana and N. Ukita, \u201cOccluded Appearance Modeling with Sample Weighting for Human Pose Estimation,\u201d IEICE Trans. Inf. &amp; Syst., vol.E100-D, no.10, pp.2627-2634, 2017. 10.1587\/transinf.2017edp7088","DOI":"10.1587\/transinf.2017EDP7088"},{"key":"13","doi-asserted-by":"publisher","unstructured":"[13] N. Ukita, \u201cPose Estimation with Action Classification Using Global-and-pose Features and Fine-grained Action-specific Pose Models,\u201d IEICE Trans. Inf. &amp; Syst., vol.E101-D, no.3, pp.758-766, 2018. 10.1587\/transinf.2017edp7204","DOI":"10.1587\/transinf.2017EDP7204"},{"key":"14","doi-asserted-by":"publisher","unstructured":"[14] Z. Cao, G. Hidalgo, T. Simon, S.-E. Wei, and Y. Sheikh, \u201cOpenPose: Realtime Multi-person 2d Pose Estimation Using Part Affinity Fields,\u201d IEEE Trans. Pattern Anal. Mach. Intell., vol.43, no.1, pp.172-186, 2021. 10.1109\/tpami.2019.2929257","DOI":"10.1109\/TPAMI.2019.2929257"},{"key":"15","doi-asserted-by":"publisher","unstructured":"[15] C. Wang, F. Zhang, and S.S. Ge, \u201cA Comprehensive Survey on 2d Multi-person Pose Estimation Methods,\u201d Engineering Applications of Artificial Intelligence, vol.102, p.104260, 2021. 10.1016\/j.engappai.2021.104260","DOI":"10.1016\/j.engappai.2021.104260"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] Y. Chen, Z. Wang, Y. Peng, Z. Zhang, G. Yu, and J. Sun, \u201cCascaded Pyramid Network for Multi-person Pose Estimation,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.7103-7112, 2018. 10.1109\/cvpr.2018.00742","DOI":"10.1109\/CVPR.2018.00742"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] Q. Bao, W. Liu, J. Hong, L. Duan, and T. Mei, \u201cPose-native Network Architecture Search for Multi-person Human Pose Estimation,\u201d Proc. 28th ACM International Conference on Multimedia, pp.592-600, 2020. 10.1145\/3394171.3413842","DOI":"10.1145\/3394171.3413842"},{"key":"18","doi-asserted-by":"crossref","unstructured":"[18] X. Dai, I. Spasi\u0107, S. Chapman, and B. Meyer, \u201cThe State of the Art in Implementing Machine Learning for Mobile Apps: A Survey,\u201d 2020 SoutheastCon, pp.1-8, IEEE, 2020. 10.1109\/southeastcon44009.2020.9249652","DOI":"10.1109\/SoutheastCon44009.2020.9249652"},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] E. Insafutdinov, M. Andriluka, L. Pishchulin, S. Tang, E. Levinkov, B. Andres, and B. Schiele, \u201cArttrack: Articulated Multi-person Tracking in the Wild,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.1293-1301, 2017. 10.1109\/cvpr.2017.142","DOI":"10.1109\/CVPR.2017.142"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] S. Hochreiter and J. Schmidhuber, \u201cLong Short-Term Memory,\u201d Neural Computation, vol.9, no.8, pp.1735-1780, 1997. 10.1162\/neco.1997.9.8.1735","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"21","doi-asserted-by":"crossref","unstructured":"[21] K. Cho, B. van Merri\u00ebnboer, \u00c7. G\u00fcl\u00e7ehre, D. Bahdanau, F. Bougares, H. Schwenk, and Y. Bengio, \u201cLearning Phrase Representations Using RNN Encoder-decoder for Statistical Machine Translation,\u201d EMNLP, pp.1724-1734, 2014. 10.3115\/v1\/d14-1179","DOI":"10.3115\/v1\/D14-1179"},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] B. Artacho and A. Savakis, \u201cUnipose: Unified Human Pose Estimation in Single Images and Videos,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.7033-7042, 2020. 10.1109\/cvpr42600.2020.00706","DOI":"10.1109\/CVPR42600.2020.00706"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] Y. Luo, J. Ren, Z. Wang, W. Sun, J. Pan, J. Liu, J. Pang, and L. Lin, \u201cLSTM Pose Machines,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.5207-5215, 2018. 10.1109\/cvpr.2018.00546","DOI":"10.1109\/CVPR.2018.00546"},{"key":"24","doi-asserted-by":"crossref","unstructured":"[24] R. Girdhar, G. Gkioxari, L. Torresani, M. Paluri, and D. Tran, \u201cDetect-and-track: Efficient Pose Estimation in Videos,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.350-359, 2018. 10.1109\/cvpr.2018.00044","DOI":"10.1109\/CVPR.2018.00044"},{"key":"25","doi-asserted-by":"publisher","unstructured":"[25] L. Zhou, Y. Chen, J. Wang, and H. Lu, \u201cProgressive Bi-c3d Pose Grammar for Human Pose Estimation,\u201d Proc. AAAI Conference on Artificial Intelligence, vol.34, no.7, pp.13033-13040, 2020. 10.1609\/aaai.v34i07.7004","DOI":"10.1609\/aaai.v34i07.7004"},{"key":"26","doi-asserted-by":"crossref","unstructured":"[26] T. Pfister, J. Charles, and A. Zisserman, \u201cFlowing ConvNets for Human Pose Estimation in Videos,\u201d Proc. IEEE International Conference on Computer Vision, pp.1913-1921, 2015. 10.1109\/iccv.2015.222","DOI":"10.1109\/ICCV.2015.222"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] J. Song, L. Wang, L. Van Gool, and O. Hilliges, \u201cThin-slicing Network: A Deep Structured Model for Pose Estimation in Videos,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.5563-5572, 2017. 10.1109\/cvpr.2017.590","DOI":"10.1109\/CVPR.2017.590"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] Y. Yang, Z. Ren, H. Li, C. Zhou, X. Wang, and G. Hua, \u201cLearning Dynamics via Graph Neural Networks for Human Pose Estimation and Tracking,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.8070-8080, 2021. 10.1109\/cvpr46437.2021.00798","DOI":"10.1109\/CVPR46437.2021.00798"},{"key":"29","doi-asserted-by":"crossref","unstructured":"[29] J. Dai, H. Qi, Y. Xiong, Y. Li, G. Zhang, H. Hu, and Y. Wei, \u201cDeformable Convolutional Networks,\u201d Proc. IEEE International Conference on Computer Vision, pp.764-773, 2017. 10.1109\/iccv.2017.89","DOI":"10.1109\/ICCV.2017.89"},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] T.-Y. Lin, M. Maire, S. Belongie, J. Hays, P. Perona, D. Ramanan, P. Doll\u00e1r, and C.L. Zitnick, \u201cMicrosoft COCO: Common Objects in Context,\u201d European Conference on Computer Vision, vol.8693, pp.740-755, Springer, 2014. 10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"31","doi-asserted-by":"crossref","unstructured":"[31] L. Pishchulin, E. Insafutdinov, S. Tang, B. Andres, M. Andriluka, P. Gehler, and B. Schiele, \u201cDeepcut: Joint Subset Partition and Labeling for Multi Person Pose Estimation,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.4929-4937, 2016. 10.1109\/cvpr.2016.533","DOI":"10.1109\/CVPR.2016.533"},{"key":"32","doi-asserted-by":"crossref","unstructured":"[32] B. Xiao, H. Wu, and Y. Wei, \u201cSimple Baselines for Human Pose Estimation and Tracking,\u201d Proc. European Conference on Computer Vision, vol.11210, pp.472-487, 2018. 10.1007\/978-3-030-01231-1_29","DOI":"10.1007\/978-3-030-01231-1_29"},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] M. Andriluka, L. Pishchulin, P. Gehler, and B. Schiele, \u201c2d Human Pose Estimation: New Benchmark and State of the Art Analysis,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.3686-3693, 2014. 10.1109\/cvpr.2014.471","DOI":"10.1109\/CVPR.2014.471"},{"key":"34","unstructured":"[34] \u201cGithub: fvcore,\u201d https:\/\/github.com\/facebookresearch\/fvcore, 2021."},{"key":"35","doi-asserted-by":"crossref","unstructured":"[35] L. Xu, Y. Guan, S. Jin, W. Liu, C. Qian, P. Luo, W. Ouyang, and X. Wang, \u201cVipnas: Efficient Video Pose Estimation via Neural Architecture Search,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.16067-16076, 2021. 10.1109\/cvpr46437.2021.01581","DOI":"10.1109\/CVPR46437.2021.01581"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E106.D\/6\/E106.D_2022EDP7182\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,3]],"date-time":"2023-06-03T04:17:31Z","timestamp":1685765851000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E106.D\/6\/E106.D_2022EDP7182\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,1]]},"references-count":35,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2023]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2022edp7182","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,6,1]]},"article-number":"2022EDP7182"}}