{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,27]],"date-time":"2025-08-27T16:13:26Z","timestamp":1756311206184,"version":"3.37.3"},"reference-count":70,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2022,3,12]],"date-time":"2022-03-12T00:00:00Z","timestamp":1647043200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,3,12]],"date-time":"2022-03-12T00:00:00Z","timestamp":1647043200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100011002","name":"National Natural Science Foundation of China-Yunnan Joint Fund","doi-asserted-by":"publisher","award":["61806176"],"award-info":[{"award-number":["61806176"]}],"id":[{"id":"10.13039\/501100011002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["2019QNA5022"],"award-info":[{"award-number":["2019QNA5022"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2022,5]]},"DOI":"10.1007\/s11263-022-01596-7","type":"journal-article","created":{"date-parts":[[2022,3,12]],"date-time":"2022-03-12T03:02:40Z","timestamp":1647054160000},"page":"1165-1180","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["iMoCap: Motion Capture from Internet Videos"],"prefix":"10.1007","volume":"130","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0050-3989","authenticated-orcid":false,"given":"Junting","family":"Dong","sequence":"first","affiliation":[]},{"given":"Qing","family":"Shuai","sequence":"additional","affiliation":[]},{"given":"Jingxiang","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Yuanqing","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Hujun","family":"Bao","sequence":"additional","affiliation":[]},{"given":"Xiaowei","family":"Zhou","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,3,12]]},"reference":[{"key":"1596_CR1","doi-asserted-by":"crossref","unstructured":"Anguelov, D., Srinivasan, P., Koller, D., Thrun, S., Rodgers, J., Davis, J. (2005). Scape: Shape completion and animation of people. In: TOG.","DOI":"10.1145\/1186822.1073207"},{"key":"1596_CR2","doi-asserted-by":"crossref","unstructured":"Arnab, A., Doersch, C., & Zisserman, A. (2019). Exploiting temporal context for 3d human pose estimation in the wild. In: CVPR.","DOI":"10.1109\/CVPR.2019.00351"},{"key":"1596_CR3","doi-asserted-by":"crossref","unstructured":"Bo, L., & Sminchisescu, C. (2010). Twin gaussian processes for structured prediction. IJCV.","DOI":"10.1007\/s11263-008-0204-y"},{"key":"1596_CR4","doi-asserted-by":"crossref","unstructured":"Bogo, F., Kanazawa, A., Lassner, C., Gehler, P., Romero, J., & Black, M.J. (2016). Keep it smpl: Automatic estimation of 3d human pose and shape from a single image. In: ECCV.","DOI":"10.1007\/978-3-319-46454-1_34"},{"key":"1596_CR5","doi-asserted-by":"crossref","unstructured":"Burenius, M., Sullivan, J., & Carlsson, S. (2013). 3d pictorial structures for multiple view articulated pose estimation. In: CVPR.","DOI":"10.1109\/CVPR.2013.464"},{"key":"1596_CR6","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., & Carlos\u00a0Niebles, J. (2015). Activitynet: A large-scale video benchmark for human activity understanding. In: CVPR.","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"1596_CR7","unstructured":"Cao, Z., Hidalgo Martinez, G., Simon, T., Wei, S., & Sheikh, Y.A. (2019). Openpose: Realtime multi-person 2d pose estimation using part affinity fields. T-PAMI."},{"key":"1596_CR8","doi-asserted-by":"crossref","unstructured":"Caspi, Y., & Irani, M. (2002). Spatio-temporal alignment of sequences. T-PAMI.","DOI":"10.1109\/TPAMI.2002.1046148"},{"key":"1596_CR9","doi-asserted-by":"crossref","unstructured":"Chen, C.H., & Ramanan, D. (2017). 3d human pose estimation = 2d pose estimation+ matching. In: CVPR.","DOI":"10.1109\/CVPR.2017.610"},{"key":"1596_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Y., Wang, Z., Peng, Y., Zhang, Z., Yu, G., & Sun, J. (2018). Cascaded pyramid network for multi-person pose estimation.","DOI":"10.1109\/CVPR.2018.00742"},{"key":"1596_CR11","doi-asserted-by":"crossref","unstructured":"Cheng, Y., Yang, B., Wang, B., Yan, W., & Tan, R.T. (2019). Occlusion-aware networks for 3d human pose estimation in video. In: ICCV.","DOI":"10.1109\/ICCV.2019.00081"},{"key":"1596_CR12","doi-asserted-by":"crossref","unstructured":"Dong, J., Jiang, W., Huang, Q., Bao, H., & Zhou, X. (2019). Fast and robust multi-person 3d pose estimation from multiple views. In: CVPR.","DOI":"10.1109\/CVPR.2019.00798"},{"key":"1596_CR13","doi-asserted-by":"crossref","unstructured":"Dong, J., Shuai, Q., Zhang, Y., Liu, X., Zhou, X., & Bao, H. (2020). Motion capture from internet videos. In: ECCV.","DOI":"10.1007\/978-3-030-58536-5_13"},{"key":"1596_CR14","doi-asserted-by":"crossref","unstructured":"Dong, J., Fang, Q., Jiang, W., Yang, Y., Huang, Q., Bao, H., & Zhou, X. (2021). Fast and robust multi-person 3d pose estimation and tracking from multiple views. T-PAMI.","DOI":"10.1109\/TPAMI.2021.3098052"},{"key":"1596_CR15","doi-asserted-by":"crossref","unstructured":"Dwibedi, D., Aytar, Y., Tompson, J., Sermanet, P., & Zisserman, A. (2019). Temporal cycle-consistency learning. In: CVPR.","DOI":"10.1109\/CVPR.2019.00190"},{"key":"1596_CR16","doi-asserted-by":"crossref","unstructured":"Elhayek, A., Stoll, C., Hasler, N., Kim, K. I., Seidel, H. P., & Theobalt, C. (2012). Spatio-temporal motion tracking with unsynchronized cameras. In: CVPR.","DOI":"10.1109\/CVPR.2012.6247886"},{"key":"1596_CR17","doi-asserted-by":"crossref","unstructured":"Elhayek, A., de\u00a0Aguiar, E., Jain, A., Tompson, J., Pishchulin, L., Andriluka, M., Bregler, C., Schiele, B., & Theobalt, C. (2015a). Efficient convnet-based marker-less motion capture in general scenes with a low number of cameras. In: CVPR.","DOI":"10.1109\/CVPR.2015.7299005"},{"key":"1596_CR18","doi-asserted-by":"crossref","unstructured":"Elhayek, A., Stoll, C., Kim, K. I., & Theobalt, C. (2015b). Outdoor human motion capture by simultaneous optimization of pose and camera parameters. In: CGF.","DOI":"10.1111\/cgf.12519"},{"key":"1596_CR19","doi-asserted-by":"crossref","unstructured":"Fang, H. S., Xie, S., Tai, Y. W., & Lu, C. (2017). RMPE: Regional multi-person pose estimation. In: ICCV.","DOI":"10.1109\/ICCV.2017.256"},{"key":"1596_CR20","doi-asserted-by":"crossref","unstructured":"Feng, Y., Ma, L., Liu, W., Zhang, T., & Luo, J. (2018). Video re-localization. In: ECCV.","DOI":"10.1007\/978-3-030-01264-9_4"},{"key":"1596_CR21","doi-asserted-by":"crossref","unstructured":"Feng, Y., Ma, L., Liu, W., & Luo, J. (2019). Spatio-temporal video re-localization by warp lstm. In: CVPR.","DOI":"10.1109\/CVPR.2019.00138"},{"key":"1596_CR22","doi-asserted-by":"crossref","unstructured":"Gall, J., Rosenhahn, B., Brox, T., Seidel, H. P. (2010). Optimization and filtering for human motion capture. IJCV.","DOI":"10.1007\/s11263-008-0173-1"},{"key":"1596_CR23","unstructured":"Guan, P., Weiss, A., Balan, A. O., & Black, M. J. (2009). Estimating human shape and pose from a single image. In: ICCV."},{"key":"1596_CR24","doi-asserted-by":"crossref","unstructured":"Guler, R. A., Kokkinos, I. (2019). Holopose: Holistic 3d human reconstruction in-the-wild. In: CVPR.","DOI":"10.1109\/CVPR.2019.01114"},{"key":"1596_CR25","doi-asserted-by":"crossref","unstructured":"Hasler, N., Rosenhahn, B., Thormahlen, T., Wand, M., Gall, J., & Seidel, H. P. (2009). Markerless motion capture with unsynchronized moving cameras. In: CVPR.","DOI":"10.1109\/CVPR.2009.5206859"},{"key":"1596_CR26","doi-asserted-by":"crossref","unstructured":"Huang, Q. X., Guibas, L. (2013). Consistent shape maps via semidefinite programming. In: CGF.","DOI":"10.1111\/cgf.12184"},{"key":"1596_CR27","doi-asserted-by":"crossref","unstructured":"Huang, Y., Bogo, F., Lassner, C., Kanazawa, A., Gehler, P. V., Romero, J., Akhter, I., & Black, M. J. (2017). Towards accurate marker-less human shape and pose estimation over time. In: 3DV.","DOI":"10.1109\/3DV.2017.00055"},{"key":"1596_CR28","doi-asserted-by":"crossref","unstructured":"Ionescu, C., Papava, D., Olaru, V., & Sminchisescu, C. (2013). Human3. 6m: Large scale datasets and predictive methods for 3d human sensing in natural environments. T-PAMI.","DOI":"10.1109\/TPAMI.2013.248"},{"key":"1596_CR29","doi-asserted-by":"crossref","unstructured":"Joo, H., Simon, T., & Sheikh, Y. (2018). Total capture: A 3d deformation model for tracking faces, hands, and bodies. In: CVPR.","DOI":"10.1109\/CVPR.2018.00868"},{"key":"1596_CR30","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Black, M. J., Jacobs, D. W., & Malik, J. (2018). End-to-end recovery of human shape and pose. In: CVPR.","DOI":"10.1109\/CVPR.2018.00744"},{"key":"1596_CR31","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Zhang, J. Y., Felsen, P., & Malik, J. (2019). Learning 3d human dynamics from video. In: CVPR.","DOI":"10.1109\/CVPR.2019.00576"},{"key":"1596_CR32","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Athanasiou, N., & Black, M. J. (2020). Vibe: Video inference for human body pose and shape estimation. In: CVPR.","DOI":"10.1109\/CVPR42600.2020.00530"},{"key":"1596_CR33","doi-asserted-by":"crossref","unstructured":"Kocabas, M., Huang, C. H. P., Hilliges, O., & Black, M. J. (2021). Pare: Part attention regressor for 3d human body estimation. ICCV.","DOI":"10.1109\/ICCV48922.2021.01094"},{"key":"1596_CR34","doi-asserted-by":"crossref","unstructured":"Lassner, C., Romero, J., Kiefel, M., Bogo, F., Black, M. J., & Gehler, P. V. (2017). Unite the people: Closing the loop between 3d and 2d human representations. In: CVPR.","DOI":"10.1109\/CVPR.2017.500"},{"key":"1596_CR35","doi-asserted-by":"crossref","unstructured":"Lee, C. S., & Elgammal, A. (2010). Coupled visual and kinematic manifold models for tracking. IJCV.","DOI":"10.1007\/s11263-009-0266-5"},{"key":"1596_CR36","doi-asserted-by":"crossref","unstructured":"Li, R., Tian, T. P., Sclaroff, S., & Yang, M. H. (2010). 3d human motion tracking with a coordinated mixture of factor analyzers. IJCV.","DOI":"10.1007\/s11263-009-0283-4"},{"key":"1596_CR37","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M. J. (2015). Smpl: A skinned multi-person linear model. TOG.","DOI":"10.1145\/2816795.2818013"},{"key":"1596_CR38","doi-asserted-by":"crossref","unstructured":"Martinez, J., Hossain, R., Romero, J., & Little, J. J. (2017). A simple yet effective baseline for 3d human pose estimation. In: ICCV.","DOI":"10.1109\/ICCV.2017.288"},{"key":"1596_CR39","doi-asserted-by":"crossref","unstructured":"Moreno-Noguer, F. (2017). 3d human pose estimation from a single image via distance matrix regression. In: CVPR.","DOI":"10.1109\/CVPR.2017.170"},{"key":"1596_CR40","doi-asserted-by":"crossref","unstructured":"Omran, M., Lassner, C., Pons-Moll, G., Gehler, P., & Schiele, B. (2018). Neural body fitting: Unifying deep learning and model based human pose and shape estimation. In: 3DV.","DOI":"10.1109\/3DV.2018.00062"},{"key":"1596_CR41","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Zhou, X., Derpanis, K. G., & Daniilidis, K. (2017). Harvesting multiple views for marker-less 3d human pose annotations. In: CVPR.","DOI":"10.1109\/CVPR.2017.138"},{"key":"1596_CR42","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Zhou, X., & Daniilidis, K. (2018a). Ordinal depth supervision for 3d human pose estimation. In: CVPR.","DOI":"10.1109\/CVPR.2018.00763"},{"key":"1596_CR43","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Zhu, L., Zhou, X., Daniilidis, K. (2018b). Learning to estimate 3d human pose and shape from a single color image. In: CVPR.","DOI":"10.1109\/CVPR.2018.00055"},{"key":"1596_CR44","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Choutas, V., Ghorbani, N., Bolkart, T., Osman, A. A., Tzionas, D., Black, M. J. (2019). Expressive body capture: 3d hands, face, and body from a single image. In: CVPR.","DOI":"10.1109\/CVPR.2019.01123"},{"key":"1596_CR45","doi-asserted-by":"crossref","unstructured":"Pavllo, D., Feichtenhofer, C., Grangier, D., & Auli, M. (2019). 3d human pose estimation in video with temporal convolutions and semi-supervised training. In: CVPR.","DOI":"10.1109\/CVPR.2019.00794"},{"key":"1596_CR46","doi-asserted-by":"crossref","unstructured":"Romero, J., Tzionas, D., & Black, M. J. (2017). Embodied hands: Modeling and capturing hands and bodies together. SIGGRAPH Asia.","DOI":"10.1145\/3130800.3130883"},{"key":"1596_CR47","doi-asserted-by":"crossref","unstructured":"Saini, N., Price, E., Tallamraju, R., Enficiaud, R., Ludwig, R., Martinovic, I., Ahmad, A., & Black, M. J. (2019). Markerless outdoor human motion capture using multiple autonomous micro aerial vehicles. In: ICCV.","DOI":"10.1109\/ICCV.2019.00091"},{"key":"1596_CR48","doi-asserted-by":"crossref","unstructured":"Sermanet, P., Lynch, C., Chebotar, Y., Hsu, J., Jang, E., Schaal, S., Levine, S., & Brain, G. (2018). Time-contrastive networks: Self-supervised learning from video. In: ICRA.","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"1596_CR49","unstructured":"Sigal, L., Balan, A., & Black, M. J. (2008). Combined discriminative and generative articulated pose and non-rigid shape estimation. In: NeurIPS."},{"key":"1596_CR50","doi-asserted-by":"crossref","unstructured":"Sigal, L., Isard, M., Haussecker, H., Black, M. J. (2012). Loose-limbed people: Estimating 3d human pose and motion using non-parametric belief propagation. IJCV.","DOI":"10.1007\/s11263-011-0493-4"},{"key":"1596_CR51","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., & Wang, J. (2019). Deep high-resolution representation learning for human pose estimation. In: CVPR.","DOI":"10.1109\/CVPR.2019.00584"},{"key":"1596_CR52","doi-asserted-by":"crossref","unstructured":"Sun, X., Shang, J., Liang, S., & Wei, Y. (2017). Compositional human pose regression. In: ICCV.","DOI":"10.1109\/ICCV.2017.284"},{"key":"1596_CR53","doi-asserted-by":"crossref","unstructured":"Sun, X., Xiao, B., Wei, F., Liang, S., & Wei, Y. (2018). Integral human pose regression. In: ECCV.","DOI":"10.1109\/ICCV.2017.284"},{"key":"1596_CR54","doi-asserted-by":"crossref","unstructured":"Tekin, B., M\u00e1rquez-Neila, P., Salzmann, M., & Fua, P. (2017). Learning to fuse 2d and 3d image cues for monocular body pose estimation. In: ICCV.","DOI":"10.1109\/ICCV.2017.425"},{"key":"1596_CR55","doi-asserted-by":"crossref","unstructured":"Tome, D., Russell, C., & Agapito, L. (2017). Lifting from the deep: Convolutional 3d pose estimation from a single image. CVPR.","DOI":"10.1109\/CVPR.2017.603"},{"key":"1596_CR56","unstructured":"Tuytelaars, T., & Van\u00a0Gool, L. (2004). Synchronizing video sequences. In: CVPR."},{"key":"1596_CR57","doi-asserted-by":"crossref","unstructured":"Ukrainitz, Y., & Irani, M. (2006). Aligning sequences and actions by maximizing space-time correlations. In: ECCV.","DOI":"10.1007\/11744078_42"},{"key":"1596_CR58","unstructured":"Wang, J., Xu, E., Xue, K., & Kidzinski, L. (2020). 3d pose detection in videos: Focusing on occlusion. arXiv preprint arXiv:200613517."},{"key":"1596_CR59","doi-asserted-by":"crossref","unstructured":"Wang, O., Schroers, C., Zimmer, H., Gross, M., & Sorkine-Hornung, A. (2014). Videosnapping: Interactive synchronization of multiple videos. TOG.","DOI":"10.1145\/2601097.2601208"},{"key":"1596_CR60","doi-asserted-by":"crossref","unstructured":"Wang, Y., Liu, Y., Tong, X., Dai, Q., & Tan, P. (2017). Outdoor markerless motion capture with sparse handheld video cameras. TVCG.","DOI":"10.1109\/TVCG.2017.2693151"},{"key":"1596_CR61","doi-asserted-by":"crossref","unstructured":"Wolf, L., & Zomet, A. (2006). Wide baseline matching between unsynchronized video sequences. IJCV.","DOI":"10.1007\/s11263-005-4841-0"},{"key":"1596_CR62","doi-asserted-by":"crossref","unstructured":"Xiang, D., Joo, H., & Sheikh, Y. (2019). Monocular total capture: Posing face, body, and hands in the wild. In: CVPR.","DOI":"10.1109\/CVPR.2019.01122"},{"key":"1596_CR63","doi-asserted-by":"crossref","unstructured":"Xu, X., & Dunn, E. (2019). Discrete laplace operator estimation for dynamic 3d reconstruction. arXiv preprint arXiv:190811044.","DOI":"10.1109\/ICCV.2019.00163"},{"key":"1596_CR64","unstructured":"Yu, C., Wang, B., Yang, B., & Tan, R. T. (2020). Multi-scale networks for 3d human pose estimation with inference stage optimization. arXiv preprint arXiv:201006844."},{"key":"1596_CR65","doi-asserted-by":"crossref","unstructured":"Zanfir, A., Marinoiu, E., & Sminchisescu, C. (2018a). Monocular 3d pose and shape estimation of multiple people in natural scenes-the importance of multiple scene constraints. In: CVPR.","DOI":"10.1109\/CVPR.2018.00229"},{"key":"1596_CR66","unstructured":"Zanfir, A., Marinoiu, E., Zanfir, M., Popa, A. I., & Sminchisescu, C. (2018b). Deep network for the integrated 3d sensing of multiple people in natural images. In: NeurIPS."},{"key":"1596_CR67","doi-asserted-by":"crossref","unstructured":"Zheng, E., Ji, D., Dunn, E., & Frahm, J. M. (2015). Sparse dynamic 3d reconstruction from unsynchronized videos. In: ICCV.","DOI":"10.1109\/ICCV.2015.504"},{"key":"1596_CR68","doi-asserted-by":"crossref","unstructured":"Zhou, X., Zhu, M., & Daniilidis, K. (2015). Multi-image matching via fast alternating minimization. In: ICCV.","DOI":"10.1109\/ICCV.2015.459"},{"key":"1596_CR69","doi-asserted-by":"crossref","unstructured":"Zhou, X., Zhu, M., Leonardos, S., Derpanis, K. G., & Daniilidis, K. (2016). Sparseness meets deepness: 3d human pose estimation from monocular video. In: CVPR.","DOI":"10.1109\/CVPR.2016.537"},{"key":"1596_CR70","doi-asserted-by":"crossref","unstructured":"Zhou, X., Huang, Q., Sun, X., Xue, X., & Wei, Y. (2017). Towards 3d human pose estimation in the wild: a weakly-supervised approach. In: ICCV.","DOI":"10.1109\/ICCV.2017.51"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01596-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-022-01596-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-022-01596-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T11:16:32Z","timestamp":1651058192000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-022-01596-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,12]]},"references-count":70,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2022,5]]}},"alternative-id":["1596"],"URL":"https:\/\/doi.org\/10.1007\/s11263-022-01596-7","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2022,3,12]]},"assertion":[{"value":"20 February 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 January 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 March 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}