{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T04:39:11Z","timestamp":1764995951519,"version":"3.46.0"},"reference-count":232,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T00:00:00Z","timestamp":1763942400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s11263-025-02557-6","type":"journal-article","created":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T08:54:06Z","timestamp":1763974446000},"page":"8356-8435","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Ego-Exo4D: Understanding Skilled Human Activity from First- and Third-Person Perspectives"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9591-5873","authenticated-orcid":false,"given":"Kristen","family":"Grauman","sequence":"first","affiliation":[]},{"given":"Andrew","family":"Westbury","sequence":"additional","affiliation":[]},{"given":"Lorenzo","family":"Torresani","sequence":"additional","affiliation":[]},{"given":"Kris","family":"Kitani","sequence":"additional","affiliation":[]},{"given":"Jitendra","family":"Malik","sequence":"additional","affiliation":[]},{"given":"Triantafyllos","family":"Afouras","sequence":"additional","affiliation":[]},{"given":"Kumar","family":"Ashutosh","sequence":"additional","affiliation":[]},{"given":"Vijay","family":"Baiyya","sequence":"additional","affiliation":[]},{"given":"Siddhant","family":"Bansal","sequence":"additional","affiliation":[]},{"given":"Bikram","family":"Boote","sequence":"additional","affiliation":[]},{"given":"Eugene","family":"Byrne","sequence":"additional","affiliation":[]},{"given":"Zach","family":"Chavis","sequence":"additional","affiliation":[]},{"given":"Joya","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Feng","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Fu-Jen","family":"Chu","sequence":"additional","affiliation":[]},{"given":"Sean","family":"Crane","sequence":"additional","affiliation":[]},{"given":"Avijit","family":"Dasgupta","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Maria","family":"Escobar","sequence":"additional","affiliation":[]},{"given":"Cristhian","family":"Forigua","sequence":"additional","affiliation":[]},{"given":"Abrham","family":"Gebreselasie","sequence":"additional","affiliation":[]},{"given":"Sanjay","family":"Haresh","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Md Mohaiminul","family":"Islam","sequence":"additional","affiliation":[]},{"given":"Suyog","family":"Jain","sequence":"additional","affiliation":[]},{"given":"Rawal","family":"Khirodkar","sequence":"additional","affiliation":[]},{"given":"Devansh","family":"Kukreja","sequence":"additional","affiliation":[]},{"given":"Kevin J.","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Jia-Wei","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Sagnik","family":"Majumder","sequence":"additional","affiliation":[]},{"given":"Yongsen","family":"Mao","sequence":"additional","affiliation":[]},{"given":"Miguel","family":"Martin","sequence":"additional","affiliation":[]},{"given":"Effrosyni","family":"Mavroudi","sequence":"additional","affiliation":[]},{"given":"Tushar","family":"Nagarajan","sequence":"additional","affiliation":[]},{"given":"Francesco","family":"Ragusa","sequence":"additional","affiliation":[]},{"given":"Santhosh Kumar","family":"Ramakrishnan","sequence":"additional","affiliation":[]},{"given":"Luigi","family":"Seminara","sequence":"additional","affiliation":[]},{"given":"Arjun","family":"Somayazulu","sequence":"additional","affiliation":[]},{"given":"Yale","family":"Song","sequence":"additional","affiliation":[]},{"given":"Shan","family":"Su","sequence":"additional","affiliation":[]},{"given":"Zihui","family":"Xue","sequence":"additional","affiliation":[]},{"given":"Edward","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jinxu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Angela","family":"Castillo","sequence":"additional","affiliation":[]},{"given":"Changan","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Xinzhu","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Ryosuke","family":"Furuta","sequence":"additional","affiliation":[]},{"given":"Cristina","family":"Gonz\u00e1lez","sequence":"additional","affiliation":[]},{"given":"Prince","family":"Gupta","sequence":"additional","affiliation":[]},{"given":"Jiabo","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Yifei","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Yiming","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Weslie","family":"Khoo","sequence":"additional","affiliation":[]},{"given":"Anush","family":"Kumar","sequence":"additional","affiliation":[]},{"given":"Robert","family":"Kuo","sequence":"additional","affiliation":[]},{"given":"Sach","family":"Lakhavani","sequence":"additional","affiliation":[]},{"given":"Miao","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Mi","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Zhengyi","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Brighid","family":"Meredith","sequence":"additional","affiliation":[]},{"given":"Austin","family":"Miller","sequence":"additional","affiliation":[]},{"given":"Oluwatumininu","family":"Oguntola","sequence":"additional","affiliation":[]},{"given":"Xiaqing","family":"Pan","sequence":"additional","affiliation":[]},{"given":"Penny","family":"Peng","sequence":"additional","affiliation":[]},{"given":"Shraman","family":"Pramanick","sequence":"additional","affiliation":[]},{"given":"Merey","family":"Ramazanova","sequence":"additional","affiliation":[]},{"given":"Fiona","family":"Ryan","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Shan","sequence":"additional","affiliation":[]},{"given":"Kiran","family":"Somasundaram","sequence":"additional","affiliation":[]},{"given":"Chenan","family":"Song","sequence":"additional","affiliation":[]},{"given":"Audrey","family":"Southerland","sequence":"additional","affiliation":[]},{"given":"Masatoshi","family":"Tateno","sequence":"additional","affiliation":[]},{"given":"Huiyu","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yuchen","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Takuma","family":"Yagi","sequence":"additional","affiliation":[]},{"given":"Mingfei","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Xitong","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Zecheng","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Shengxin Cindy","family":"Zha","sequence":"additional","affiliation":[]},{"given":"Chen","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Ziwei","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Zhifan","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Jeff","family":"Zhuo","sequence":"additional","affiliation":[]},{"given":"Pablo","family":"Arbel\u00e1ez","sequence":"additional","affiliation":[]},{"given":"Gedas","family":"Bertasius","sequence":"additional","affiliation":[]},{"given":"David","family":"Crandall","sequence":"additional","affiliation":[]},{"given":"Dima","family":"Damen","sequence":"additional","affiliation":[]},{"given":"Jakob","family":"Engel","sequence":"additional","affiliation":[]},{"given":"Giovanni Maria","family":"Farinella","sequence":"additional","affiliation":[]},{"given":"Antonino","family":"Furnari","sequence":"additional","affiliation":[]},{"given":"Bernard","family":"Ghanem","sequence":"additional","affiliation":[]},{"given":"Judy","family":"Hoffman","sequence":"additional","affiliation":[]},{"given":"C. V.","family":"Jawahar","sequence":"additional","affiliation":[]},{"given":"Richard","family":"Newcombe","sequence":"additional","affiliation":[]},{"given":"Hyun Soo","family":"Park","sequence":"additional","affiliation":[]},{"given":"James M.","family":"Rehg","sequence":"additional","affiliation":[]},{"given":"Yoichi","family":"Sato","sequence":"additional","affiliation":[]},{"given":"Manolis","family":"Savva","sequence":"additional","affiliation":[]},{"given":"Jianbo","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[]},{"given":"Michael","family":"Wray","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,24]]},"reference":[{"key":"2557_CR1","doi-asserted-by":"crossref","unstructured":"Aboukhadra, A. T., Malik, J., Elhayek, A., Robertini, N., & Stricker, D. (2023). Thor-net: End-to-end graformer-based realistic two hands and object reconstruction with self-supervision. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1001\u20131010.","DOI":"10.1109\/WACV56688.2023.00106"},{"key":"2557_CR2","doi-asserted-by":"crossref","unstructured":"Abrash, M. (2021). Creating the future: Augmented reality, the next human-machine interface. In 2021 IEEE International Electron Devices Meeting (IEDM).","DOI":"10.1109\/IEDM19574.2021.9720526"},{"key":"2557_CR3","doi-asserted-by":"crossref","unstructured":"Ahuja, K., Harrison, C., Goel, M., & Xiao, R. (2019). Mecap: Whole-body digitization for low-cost vr\/ar headsets. In Proceedings of the 32nd Annual ACM Symposium on User Interface Software and Technology, pp. 453\u2013462.","DOI":"10.1145\/3332165.3347889"},{"key":"2557_CR4","doi-asserted-by":"crossref","unstructured":"Alayrac, J.-B., Bojanowski, P., Agrawal, N., Sivic, J., Laptev, I., & Lacoste-Julien, S. (2016). Unsupervised learning from narrated instruction videos. In: CVPR.","DOI":"10.1109\/CVPR.2016.495"},{"key":"2557_CR5","doi-asserted-by":"crossref","unstructured":"Ardeshir, S., & Borji, A. (2016). Ego2top: Matching viewers in egocentric and top-view videos. In ECCV.","DOI":"10.1007\/978-3-319-46454-1_16"},{"issue":"6","key":"2557_CR6","doi-asserted-by":"publisher","first-page":"1353","DOI":"10.1109\/TPAMI.2018.2832121","volume":"41","author":"S Ardeshir","year":"2018","unstructured":"Ardeshir, S., & Borji, A. (2018). Egocentric meets top-view. IEEE transactions on pattern analysis and machine intelligence, 41(6), 1353\u20131366.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"2557_CR7","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1016\/j.cviu.2018.05.005","volume":"171","author":"S Ardeshir","year":"2018","unstructured":"Ardeshir, S., & Borji, A. (2018). An exocentric look at egocentric actions and vice versa. Computer Vision and Image Understanding, 171, 61\u201368.","journal-title":"Computer Vision and Image Understanding"},{"key":"2557_CR8","doi-asserted-by":"crossref","unstructured":"Ashutosh, K., Girdhar, R., Torresani, L., & Grauman, K. (2023). Hiervl: Learning hierarchical video-language embeddings. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.02209"},{"key":"2557_CR9","doi-asserted-by":"crossref","unstructured":"Ashutosh, K., Nagarajan, T., Pavlakos, G., Kitani, K., & Grauman, K. (2024). ExpertAF: Expert Actionable Feedback from Video.","DOI":"10.1109\/CVPR52734.2025.01268"},{"key":"2557_CR10","unstructured":"Ashutosh, K., Ramakrishnan, S. K., Afouras, T., & Grauman, K. (2023). Video-mined task graphs for keystep recognition in instructional videos. In: NeurIPS."},{"key":"2557_CR11","doi-asserted-by":"crossref","unstructured":"Bansal, S., Arora, C., & Jawahar, C. V. (2022). My view is the best view: Procedure learning from egocentric videos. In: European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-031-19778-9_38"},{"key":"2557_CR12","doi-asserted-by":"crossref","unstructured":"Ben-Shabat, Y., Yu, X., Saleh, F., Campbell, D., Rodriguez-Opazo, C., Li, H., & Gould, S. (2020). The ikea asm dataset: Understanding people assembling furniture through actions, objects and pose.","DOI":"10.1109\/WACV48630.2021.00089"},{"key":"2557_CR13","doi-asserted-by":"crossref","unstructured":"Bertasius, G., Park, H. S., Yu, S., & Shi, J. (2017). Am i a baller? basketball performance assessment from first-person videos. In ICCV.","DOI":"10.1109\/ICCV.2017.239"},{"key":"2557_CR14","unstructured":"Bertasius, G., Wang, H., & Torresani, L. (2021). Is space-time attention all you need for video understanding? In: ICML."},{"key":"2557_CR15","doi-asserted-by":"crossref","unstructured":"Bi, J., Luo, J., & Xu, C. (2021). Procedure planning in instructional videos via contextual modeling and model-based policy learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15611\u201315620.","DOI":"10.1109\/ICCV48922.2021.01532"},{"key":"2557_CR16","doi-asserted-by":"crossref","unstructured":"Bi, J., Luo, J., & Xu, C. (2021). Procedure planning in instructional videos via contextual modeling and model-based policy learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15611\u201315620.","DOI":"10.1109\/ICCV48922.2021.01532"},{"key":"2557_CR17","doi-asserted-by":"crossref","unstructured":"Brodersen, K. H., Ong, C. S., Stephan, K. E., & Buhmann, J. M. (2010). The balanced accuracy and its posterior distribution. In: 2010 20th International Conference on Pattern Recognition, pp. 3121\u20133124. IEEE.","DOI":"10.1109\/ICPR.2010.764"},{"key":"2557_CR18","doi-asserted-by":"crossref","unstructured":"Cao, M., Yang, T., Weng, J., Zhang, C., Wang, J., & Zou, Y. (2022). Locvtp: Video-text pre-training for temporal localization. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-031-19809-0_3"},{"key":"2557_CR19","doi-asserted-by":"crossref","unstructured":"Castillo, A., Escobar, M., Jeanneret, G., Pumarola, A., Arbel\u00e1ez, P., Thabet, A., & Sanakoyeu, A. (2023). Bodiffusion: Diffusing sparse observations for full-body human motion synthesis. CV4Metaverse workshop, International Conference on Computer Vision.","DOI":"10.1109\/ICCVW60793.2023.00456"},{"key":"2557_CR20","doi-asserted-by":"crossref","unstructured":"Chan, E. R., Nagano, K., Chan, M. A., Bergman, A. W., Park, J. J., Levy, A., Aittala, M., De Mello, S., Karras, T., & Wetzstein, G. (2023). Generative novel view synthesis with 3d-aware diffusion models. arXiv preprint arXiv:2304.02602.","DOI":"10.1109\/ICCV51070.2023.00389"},{"key":"2557_CR21","doi-asserted-by":"crossref","unstructured":"Chang, A., Dai, A., Funkhouser, T., Nie\u00dfner, M., Savva, M., Song, S., Zeng, A., & Zhang, Y. (2017). Matterport3d: Learning from rgb-d data in indoor environments. In Proceedings of the International Conference on 3D Vision (3DV). MatterPort3D dataset license available at: http:\/\/kaldir.vc.in.tum.de\/matterport\/MP_TOS.pdf","DOI":"10.1109\/3DV.2017.00081"},{"key":"2557_CR22","doi-asserted-by":"crossref","unstructured":"Chang, C.-Y., Huang, D.-A., Xu, D., Adeli, E., Fei-Fei, L., & Niebles, J. C. (2020). Procedure planning in instructional videos. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XI, pp. 334\u2013350. Springer","DOI":"10.1007\/978-3-030-58621-8_20"},{"key":"2557_CR23","doi-asserted-by":"crossref","unstructured":"Chang, C.-Y., Huang, D.-A., Xu, D., Adeli, E., Fei-Fei, L., & Niebles, J. C. (2020). Procedure planning in instructional videos. In European Conference on Computer Vision, pp. 334\u2013350. Springer.","DOI":"10.1007\/978-3-030-58621-8_20"},{"key":"2557_CR24","unstructured":"Chen, F., Ding, L., Lertniphonphan, K., Li, J., Huang, K., & Wang, Z. (2024). Pcie_egohandpose solution for egoexo4d hand pose challenge. arXiv preprint arXiv:2406.12219."},{"key":"2557_CR25","doi-asserted-by":"crossref","unstructured":"Chen, Y.-H., Yang, T.-J., Emer, J., & Sze, V. (2019). Eyeriss v2: A flexible accelerator for emerging deep neural networks on mobile devices. IEEE Journal on Emerging and Selected Topics in Circuits and Systems.","DOI":"10.1109\/JETCAS.2019.2910232"},{"key":"2557_CR26","doi-asserted-by":"crossref","unstructured":"Cheng, H. K., & Schwing, A. G. (2022). Xmem: Long-term video object segmentation with an atkinson-shiffrin memory model. In: European Conference on Computer Vision, pp. 640\u2013658. Springer.","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"2557_CR27","doi-asserted-by":"crossref","unstructured":"Cheng, F., Luo, M., Wang, H., Dimakis, A., Torresani, L., Bertasius, G., & Grauman, K. (2024). 4DIFF: 3d-aware diffusion model for third-to-first viewpoint translation. In: ECCV.","DOI":"10.1007\/978-3-031-72691-0_23"},{"key":"2557_CR28","doi-asserted-by":"crossref","unstructured":"Corona, K., Osterdahl, K., Collins, R., & Hoogs, A. (2021). Meva: A large-scale multiview, multimodal video dataset for activity detection. In WACV.","DOI":"10.1109\/WACV48630.2021.00110"},{"key":"2557_CR29","doi-asserted-by":"crossref","unstructured":"Damen, D., Doughty, H., Farinella, G. M., Fidler, S., Furnari, A., Kazakos, E., Moltisanti, D., Munro, J., Perrett, T., Price, W., & Wray, M. (2018). Scaling egocentric vision: The epic-kitchens dataset. In European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"2557_CR30","unstructured":"Damen, D., Doughty, H., Farinella, G. M., Furnari, A., Ma, J., Kazakos, E., Moltisanti, D., Munro, J., Perrett, T., Price, W., & Wray, M. (2021). Rescaling egocentric vision. IJCV."},{"issue":"1","key":"2557_CR31","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s11263-021-01531-2","volume":"130","author":"D Damen","year":"2022","unstructured":"Damen, D., Doughty, H., Farinella, G. M., Furnari, A., Kazakos, E., Ma, J., Moltisanti, D., Munro, J., Perrett, T., Price, W., & Wray, M. (2022). Rescaling egocentric vision: collection, pipeline and challenges for epic-kitchens-100. International Journal of Computer Vision, 130(1), 33\u201355.","journal-title":"International Journal of Computer Vision"},{"key":"2557_CR32","doi-asserted-by":"crossref","unstructured":"De Geest, R., Gavves, E., Ghodrati, A., Li, Z., Snoek, C., & Tuytelaars, T. (2016). Online action detection. In Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14, pp. 269\u2013284. Springer.","DOI":"10.1007\/978-3-319-46454-1_17"},{"key":"2557_CR33","doi-asserted-by":"crossref","unstructured":"Desislavov, R., Mart\u00ednez-Plumed, F., & Hern\u00e1ndez-Orallo, J. (2023). Trends in ai inference energy consumption: Beyond the performance-vs-parameter laws of deep learning. Sustainable Computing: Informatics and Systems,38, Article 100857.","DOI":"10.1016\/j.suscom.2023.100857"},{"key":"2557_CR34","unstructured":"Ding, G., Sener, F., Ma, S., & Yao, A. (2023). Every mistake counts in assembly. arXiv preprint arXiv:2307.16453."},{"issue":"5","key":"2557_CR35","first-page":"2567","volume":"44","author":"K Ding","year":"2020","unstructured":"Ding, K., Ma, K., Wang, S., & Simoncelli, E. P. (2020). Image quality assessment: Unifying structure and texture similarity. IEEE transactions on pattern analysis and machine intelligence, 44(5), 2567\u20132581.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"2557_CR36","unstructured":"Donley, J., Tourbabin, V., Lee, J.-S., Broyles, M., Jiang, H., Shen, J., Pantic, M., Ithapu, V. K., & Mehra, R. (2021). Easycom: An augmented reality dataset to support algorithms for easy communication in noisy environments. arXiv:2107.04174."},{"key":"2557_CR37","unstructured":"Dosovitskiy, A. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929."},{"key":"2557_CR38","doi-asserted-by":"crossref","unstructured":"Doughty, H., Damen, D., & Mayol-Cuevas, W. (2018). Who\u2019s better? who\u2019s best? pairwise deep ranking for skill determination. In: CVPR.","DOI":"10.1109\/CVPR.2018.00634"},{"key":"2557_CR39","doi-asserted-by":"crossref","unstructured":"Doughty, H., Mayol-Cuevas, W., & Damen, D. (2019). The Pros and Cons: Rank-aware Temporal Attention for Skill Determination in Long Videos.","DOI":"10.1109\/CVPR.2019.00805"},{"key":"2557_CR40","doi-asserted-by":"crossref","unstructured":"Dvornik, N., Hadji, I., Pham, H., Bhatt, D., Martinez, B., Fazly, A., & Jepson, A. D. (2022). Flow graph to video grounding for weakly-supervised multi-step localization. In ECCV, pp. 319\u2013335. Springer.","DOI":"10.1007\/978-3-031-19833-5_19"},{"key":"2557_CR41","doi-asserted-by":"crossref","unstructured":"Elhamifar, E., & Huynh, D. (2020). Self-supervised multi-task procedure learning from instructional videos. In European Conference on Computer Vision, pp. 557\u2013573. Springer.","DOI":"10.1007\/978-3-030-58520-4_33"},{"key":"2557_CR42","unstructured":"Engel, J., Somasundaram, K., Goesele, M., Sun, A., Gamino, A., Turner, A., Talattof, A., Yuan, A., Souti, B., Meredith, B., Peng, C., Sweeney, C., Wilson, C., Barnes, D., DeTone, D., Caruso, D., Valleroy, D., Ginjupalli, D., Frost, D., Miller, E., & Mueggler, E. et\u00a0al. (2023). Project Aria: A New Tool for Egocentric Multi-Modal AI Research."},{"key":"2557_CR43","unstructured":"Esser, S. K., McKinstry, J. L., Bablani, D., Appuswamy, R., & Modha, D. S. (2019). Learned step size quantization. arXiv preprint arXiv:1902.08153."},{"key":"2557_CR44","doi-asserted-by":"crossref","unstructured":"Fan, C., Lee, J., Xu, M., Kumar Singh, K., Jae Lee, Y., Crandall, D. J., & Ryoo, M. S. (2017). Identifying first-person camera wearers in third-person videos. In CVPR.","DOI":"10.1109\/CVPR.2017.503"},{"key":"2557_CR45","doi-asserted-by":"crossref","unstructured":"Fan, Z., Taheri, O., Tzionas, D., Kocabas, M., Kaufmann, M., Black, M. J., & Hilliges, O. (2023). ARCTIC: A dataset for dexterous bimanual hand-object manipulation. In Proceedings IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR52729.2023.01244"},{"key":"2557_CR46","doi-asserted-by":"crossref","unstructured":"Fathi, A., Hodgins, J. K., & Rehg, J. M. (2012). Social interactions: A first-person perspective. In CVPR.","DOI":"10.1109\/CVPR.2012.6247805"},{"key":"2557_CR47","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C. (2020). X3d: Expanding architectures for efficient video recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 203\u2013213.","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"2557_CR48","doi-asserted-by":"crossref","unstructured":"Flavell, J. H., Flavell, E. R., Green, F. L., & Wilcox, S. A. (1981). The development of three spatial perspective-taking rules. Child Development.","DOI":"10.2307\/1129250"},{"key":"2557_CR49","doi-asserted-by":"crossref","unstructured":"Fu, R., Zhang, D., Jiang, A., Fu, W., Funk, A., Ritchie, D., & Sridhar, S. (2025). Gigahands: A massive annotated dataset of bimanual hand activities. In CVPR.","DOI":"10.1109\/CVPR52734.2025.01627"},{"key":"2557_CR50","doi-asserted-by":"crossref","unstructured":"Gao, R., Oh, T.-H., Grauman, K., & Torresani, L. (2020). Listen to look: Action recognition by previewing audio. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01047"},{"key":"2557_CR51","doi-asserted-by":"crossref","unstructured":"Ghodrati, A., Bejnordi, B. E., & Habibian, A. (2021). Frameexit: Conditional early exiting for efficient video recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR46437.2021.01535"},{"key":"2557_CR52","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Singh, M., Ravi, N., Maaten, L., Joulin, A., & Misra, I. (2022). Omnivore: A single model for many visual modalities. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16102\u201316112.","DOI":"10.1109\/CVPR52688.2022.01563"},{"key":"2557_CR53","doi-asserted-by":"crossref","unstructured":"Goyal, R., Ebrahimi Kahou, S., Michalski, V., Materzynska, J., Westphal, S., Kim, H., Haenel, V., Fruend, I., Yianilos, P., Mueller-Freitag, M., Hoppe, F. (2017). The \u201csomething something\u201d video database for learning and evaluating visual common sense. In Proceedings of the IEEE International Conference on Computer Vision, pp. 5842\u20135850.","DOI":"10.1109\/ICCV.2017.622"},{"key":"2557_CR54","doi-asserted-by":"crossref","unstructured":"Grauman, K., Westbury, A., Byrne, E., Chavis, Z., Furnari, A., Girdhar, R., Hamburger, J., Jiang, H., Liu, M., Liu, X., & Martin, M. (2022). Ego4d: Around the world in 3,000 hours of egocentric video. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012.","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"2557_CR55","doi-asserted-by":"crossref","unstructured":"Grauman, K., Westbury, A., Byrne, E., Chavis, Z., Furnari, A., Girdhar, R., Hamburger, J., Jiang, H., Liu, M., Liu, X., Martin, M., Nagarajan, T., Radosavovic, I., Ramakrishnan, S. K., Ryan, F., Sharma, J., Wray, M., Xu, M., Xu, E. Z., Zhao, C., & Bansal, S. et\u00a0al. Ego4D: Around the world in 3,000 hours of egocentric video. In CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"2557_CR56","doi-asserted-by":"crossref","unstructured":"Grauman, K., Westbury, A., Torresani, L., Kitani, K., Malik, J., Afouras, T., Ashutosh, K., Baiyya, V., Bansal, S., Boote, B., Byrne, E., Chavis, Z., Chen, J., Cheng, F., Chu, F.-J., Crane, S., Dasgupta, A., Dong, J., Escobar, M., Forigua, C., & Gebreselasie, A. et\u00a0al. (2024). Ego-Exo4D: Understanding skilled human activity from first- and third-person perspectives. In CVPR.","DOI":"10.1109\/CVPR52733.2024.01834"},{"key":"2557_CR57","doi-asserted-by":"crossref","unstructured":"Gu, C., Sun, C., Ross, D. A., Vondrick, C., Pantofaru, C., Li, Y., Vijayanarasimhan, S., Toderici, G., Ricco, S., Sukthankar, R., Schmid, C., & Malik, J. (2018). Ava: A video dataset of spatio-temporally localized atomic visual actions. In CVPR.","DOI":"10.1109\/CVPR.2018.00633"},{"key":"2557_CR58","doi-asserted-by":"crossref","unstructured":"Guzov, V., Mir, A., Sattler, T., & Pons-Moll, G. (2021). Human poseitioning system (hps): 3d human pose estimation and self-localization in large scenes from body-mounted sensors. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00430"},{"key":"2557_CR59","doi-asserted-by":"crossref","unstructured":"Hampali, S., Rad, M., Oberweger, M., & Lepetit, V. (2020). Honnotate: A method for 3d annotation of hand and object poses. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3196\u20133206.","DOI":"10.1109\/CVPR42600.2020.00326"},{"key":"2557_CR60","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2557_CR61","unstructured":"Hinton, G., Vinyals, O., & Dean, J. (2015). Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531."},{"key":"2557_CR62","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, 33, 6840\u20136851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2557_CR63","doi-asserted-by":"crossref","unstructured":"Hore, A., & Ziou, D. (2010). Image quality metrics: Psnr vs. ssim. In 2010 20th International Conference on Pattern Recognition, pp. 2366\u20132369. IEEE","DOI":"10.1109\/ICPR.2010.579"},{"key":"2557_CR64","doi-asserted-by":"crossref","unstructured":"Horowitz, M. (2014). 1.1 computing\u2019s energy problem (and what we can do about it). In 2014 IEEE International Solid-state Circuits Conference Digest of Technical Papers (ISSCC).","DOI":"10.1109\/ISSCC.2014.6757323"},{"key":"2557_CR65","unstructured":"Howard, A. G., Zhu, M., Chen, B., Kalenichenko, D., Wang, W., Weyand, T., Andreetto, M., & Adam, H. (2017). Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861."},{"key":"2557_CR66","doi-asserted-by":"crossref","unstructured":"Huang, Y., Chen, G., Xu, J., Zhang, M., Yang, L., Pei, B., Zhang, H., Dong, L., Wang, Y., Wang, L., & Qiao, Y. (2024). Egoexolearn: A dataset for bridging asynchronous ego-and exo-centric view of procedural activities in real world. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22072\u201322086.","DOI":"10.1109\/CVPR52733.2024.02084"},{"key":"2557_CR67","doi-asserted-by":"crossref","unstructured":"Hwang, D.-H., Aso, K., Yuan, Y., Kitani, K., & Koike, H. (2020). Monoeye: Multimodal human motion capture system using a single ultra-wide fisheye camera. In Proceedings of the 33rd Annual ACM Symposium on User Interface Software and Technology, pp. 98\u2013111.","DOI":"10.1145\/3379337.3415856"},{"key":"2557_CR68","unstructured":"Iandola, F. N., Han, S., Moskewicz, M. W., Ashraf, K., Dally, W. J., & Keutzer, K. (2016). Squeezenet: Alexnet-level accuracy with 50x fewer parameters and $$<$$ 0.5 mb model size. arXiv preprint arXiv:1602.07360."},{"key":"2557_CR69","doi-asserted-by":"crossref","unstructured":"Iashin, V., & Rahtu, E. (2020). Multi-modal dense video captioning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 958\u2013959.","DOI":"10.1109\/CVPRW50498.2020.00487"},{"key":"2557_CR70","first-page":"214","volume":"2018","author":"H Ismail Fawaz","year":"2018","unstructured":"Ismail Fawaz, H., Forestier, G., Weber, J., Idoumghar, L., & Muller, P.-A. (2018). Evaluating surgical skills from kinematic data using convolutional neural networks. Medical Image Computing and Computer Assisted Intervention - MICCAI, 2018, 214\u2013221.","journal-title":"Medical Image Computing and Computer Assisted Intervention - MICCAI"},{"key":"2557_CR71","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.-Y., Zhou, T., & Efros, A. A. (2017). Image-to-image translation with conditional adversarial networks. CVPR.","DOI":"10.1109\/CVPR.2017.632"},{"key":"2557_CR72","doi-asserted-by":"crossref","unstructured":"Jang, Y., Sohn, S., Logeswaran, L., Luo, T., Lee, M., & Lee, H. (2023). Multimodal subtask graph generation from instructional videos. arXiv preprint arXiv:2302.08672.","DOI":"10.18653\/v1\/2023.findings-acl.210"},{"key":"2557_CR73","doi-asserted-by":"crossref","unstructured":"Jia, B., Chen, Y., Huang, S., Zhu, Y., & Zhu, S.-C. (2020). Lemma: A multi-view dataset for le arning m ulti-agent m ulti-task a ctivities. In European Conference on Computer Vision, pp. 767\u2013786. Springer.","DOI":"10.1007\/978-3-030-58574-7_46"},{"key":"2557_CR74","doi-asserted-by":"crossref","unstructured":"Jiang, H., & Grauman, K. (2017). Seeing invisible poses: Estimating 3d body pose from egocentric video. In CVPR.","DOI":"10.1109\/CVPR.2017.373"},{"key":"2557_CR75","doi-asserted-by":"crossref","unstructured":"Jiang, J., Streli, P., Qiu, H., Fender, A., Laich, L., Snape, P., & Holz, C. (2022). Avatarposer: Articulated full-body pose tracking from sparse motion sensing. In European Conference on Computer Vision, pp. 443\u2013460. Springer","DOI":"10.1007\/978-3-031-20065-6_26"},{"key":"2557_CR76","doi-asserted-by":"crossref","unstructured":"Jiang, W., Trulls, E., Hosang, J., Tagliasacchi, A., & Yi, K. M. (2021). Cotr: Correspondence transformer for matching across images. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6207\u20136217.","DOI":"10.1109\/ICCV48922.2021.00615"},{"key":"2557_CR77","unstructured":"Joo, H., Simon, T., Li, X., Liu, H., Tan, L., Gui, L., Banerjee, S., Godisart, T. S., Nabbe, B., Matthews, I., Kanade, T., Nobuhara, S., & Sheikh, Y. (2017). Panoptic studio: A massively multiview system for social interaction capture. IEEE Transactions on Pattern Analysis and Machine Intelligence."},{"key":"2557_CR78","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T., Natsev, P., & Suleyman, M. (2017). The kinetics human action video dataset. arXiv preprint arXiv:1705.06950."},{"key":"2557_CR79","doi-asserted-by":"crossref","unstructured":"Khirodkar, R., Bansal, A., Ma, L., Newcombe, R., Vo, M., & Kitani, K. (2023). Egohumans: An egocentric 3d multi-human benchmark. In ICCV.","DOI":"10.1109\/ICCV51070.2023.01814"},{"key":"2557_CR80","unstructured":"Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980."},{"key":"2557_CR81","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A. C., Lo, W.-Y., Dollar, P., & Girshick, R. (2023). Segment anything. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 4015\u20134026.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2557_CR82","doi-asserted-by":"crossref","unstructured":"Ko, D., Choi, J., Ko, J., Noh, S., On, K.-W., Kim, E.-S., & Kim, H. J. (2022). Video-text representation learning via differentiable weak temporal alignment. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5016\u20135025.","DOI":"10.1109\/CVPR52688.2022.00496"},{"key":"2557_CR83","doi-asserted-by":"crossref","unstructured":"Kolotouros, N., Pavlakos, G., Black, M. J., & Daniilidis, K. (2019). Learning to reconstruct 3d human pose and shape via model-fitting in the loop. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2252\u20132261.","DOI":"10.1109\/ICCV.2019.00234"},{"key":"2557_CR84","doi-asserted-by":"crossref","unstructured":"Korbar, B., Tran, D., & Torresani, L. (2019). Scsampler: Sampling salient clips from video for efficient action recognition. In Proceedings of the IEEE\/CVF International Conference on Computer Vision","DOI":"10.1109\/ICCV.2019.00633"},{"key":"2557_CR85","doi-asserted-by":"crossref","unstructured":"Kukelova, Z., Heller, J., & Fitzgibbon, A. (2016). Efficient intersection of three quadrics and applications in computer vision. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1799\u20131808.","DOI":"10.1109\/CVPR.2016.199"},{"key":"2557_CR86","doi-asserted-by":"crossref","unstructured":"Kwak, I., Guo, J.-Z., Hantman, A., Kriegman, D., & Branson, K. (2020). Detecting the starting frame of actions in video. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 489\u2013497.","DOI":"10.1109\/WACV45572.2020.9093405"},{"key":"2557_CR87","doi-asserted-by":"crossref","unstructured":"Kwon, T., Tekin, B., St\u00fchmer, J., Bogo, F., & Pollefeys, M. (2021). H2o: Two hands manipulating objects for first person interaction recognition. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10138\u201310148","DOI":"10.1109\/ICCV48922.2021.00998"},{"key":"2557_CR88","unstructured":"Lee, Y. J., Ghosh, J., & Grauman, K. (2012). Discovering important people and objects for egocentric video summarization. In CVPR."},{"key":"2557_CR89","doi-asserted-by":"crossref","unstructured":"Li, J., Bian, S., Zeng, A., Wang, C., Pang, B., Liu, W., & Lu, C. (2021). Human pose regression with residual log-likelihood estimation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11025\u201311034.","DOI":"10.1109\/ICCV48922.2021.01084"},{"key":"2557_CR90","doi-asserted-by":"crossref","unstructured":"Li, Y., Liu, M., & Rehg, J. M. (2018). In the eye of beholder: Joint learning of gaze and actions in first person video. In ECCV.","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"2557_CR91","doi-asserted-by":"crossref","unstructured":"Li, J., Liu, K., & Wu, J. (2023). Ego-body pose estimation via ego-head pose estimation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17142\u201317151.","DOI":"10.1109\/CVPR52729.2023.01644"},{"key":"2557_CR92","doi-asserted-by":"crossref","unstructured":"Li, Y., Nagarajan, T., Xiong, B., & Grauman, K. (2021). Ego-exo: Transferring visual representations from third-person to first-person videos. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6943\u20136953.","DOI":"10.1109\/CVPR46437.2021.00687"},{"key":"2557_CR93","doi-asserted-by":"crossref","unstructured":"Li, J., Xu, C., Chen, Z., Bian, S., Yang, L., & Lu, C. (2021). Hybrik: A hybrid analytical-neural inverse kinematics solution for 3d human pose and shape estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3383\u20133393.","DOI":"10.1109\/CVPR46437.2021.00339"},{"key":"2557_CR94","doi-asserted-by":"crossref","unstructured":"Liao, J., Duan, H., Feng, K., Zhao, W., Yang, Y., & Chen, L. (2023). A light weight model for active speaker detection. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22932\u201322941.","DOI":"10.1109\/CVPR52729.2023.02196"},{"key":"2557_CR95","doi-asserted-by":"crossref","unstructured":"Lin, T., Cui, Y., Belongie, S., & Hays, J. (2015). Learning deep representations for ground-to-aerial geolocalization. In CVPR.","DOI":"10.1109\/CVPR.2015.7299135"},{"key":"2557_CR96","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., & Belongie, S. (2017). Feature pyramid networks for object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125.","DOI":"10.1109\/CVPR.2017.106"},{"key":"2557_CR97","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft COCO: Common objects in context. In ECCV.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2557_CR98","doi-asserted-by":"crossref","unstructured":"Lin, X., Petroni, F., Bertasius, G., Rohrbach, M., Chang, S.-F., & Torresani, L. (2022). Learning to recognize procedural activities with distant supervision. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13853\u201313863.","DOI":"10.1109\/CVPR52688.2022.01348"},{"key":"2557_CR99","doi-asserted-by":"crossref","unstructured":"Lin, K., Wang, L., & Liu, Z. (2021). End-to-end human pose and mesh reconstruction with transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1954\u20131963.","DOI":"10.1109\/CVPR46437.2021.00199"},{"key":"2557_CR100","unstructured":"Lin, K. Q., Wang, A. J., Soldan, M., Wray, M., Yan, R., Xu, E. Z., Gao, D., Tu, R., Zhao, W., Kong, W., & Cai, C. (2022). Egocentric video-language pretraining. NeurIPS."},{"key":"2557_CR101","doi-asserted-by":"crossref","unstructured":"Liu, C., Bainbridge, L., Berkovich, A., Chen, S., Gao, W., Tsai, T.-H., Mori, K., Ikeno, R., Uno, M., Isozaki, T., & Tsai, Y. L. (2020). A 4.6 $$\\mu $$m, 512$$\\times $$ 512, ultra-low power stacked digital pixel sensor with triple quantization and 127db dynamic range. In 2020 IEEE International Electron Devices Meeting (IEDM).","DOI":"10.1109\/IEDM13553.2020.9371913"},{"key":"2557_CR102","doi-asserted-by":"crossref","unstructured":"Liu, D., Li, Q., Jiang, T., Wang, Y., Miao, R., Shan, F., & Li, Z. (2021). Towards unified surgical skill assessment. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9522\u20139531.","DOI":"10.1109\/CVPR46437.2021.00940"},{"key":"2557_CR103","doi-asserted-by":"crossref","unstructured":"Liu, Z., Ning, J., Cao, Y., Wei, Y., Zhang, Z., Lin, S., & Hu, H. (2022). Video swin transformer. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3202\u20133211.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"2557_CR104","doi-asserted-by":"crossref","unstructured":"Liu, G., Tang, H., Latapie, H., & Yan, Y. (2020). Exocentric to egocentric image generation via parallel generative adversarial network. In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1843\u20131847. IEEE","DOI":"10.1109\/ICASSP40776.2020.9053957"},{"key":"2557_CR105","doi-asserted-by":"crossref","unstructured":"Liu, G., Tang, H., Latapie, H. M., Corso, J. J., & Yan, Y. (2021). Cross-view exocentric to egocentric video synthesis. In Proceedings of the 29th ACM International Conference on Multimedia, pp. 974\u2013982.","DOI":"10.1145\/3474085.3475596"},{"key":"2557_CR106","doi-asserted-by":"crossref","unstructured":"Liu, A., Tucker, R., Jampani, V., Makadia, A., Snavely, N., & Kanazawa, A. (2021). Infinite nature: Perpetual view generation of natural scenes from a single image. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14458\u201314467.","DOI":"10.1109\/ICCV48922.2021.01419"},{"key":"2557_CR107","unstructured":"Loshchilov, I., & Hutter, F. (2019). Decoupled weight decay regularization. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=Bkg6RiCqY7."},{"key":"2557_CR108","doi-asserted-by":"crossref","unstructured":"Lu, X., Li, Z., Cui, Z., Oswald, M. R., Pollefeys, M., & Qin, R. (2020). Geometry-aware satellite-to-ground image synthesis for urban areas. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR42600.2020.00094"},{"key":"2557_CR109","unstructured":"Luo, Z., Hachiuma, R., Yuan, Y., & Kitani, K. (2021). Dynamics-regulated kinematic policy for egocentric pose estimation. In Advances in Neural Information Processing Systems."},{"key":"2557_CR110","doi-asserted-by":"crossref","unstructured":"Luo, M., Xue, Z., Dimakis, A., & Grauman, K. (2024). Put myself in your shoes: Lifting the egocentric perspective from exocentric videos. arXiv:2403.06351.","DOI":"10.1007\/978-3-031-72920-1_23"},{"key":"2557_CR111","doi-asserted-by":"crossref","unstructured":"Luo, M., Xue, Z., Dimakis, A., & Grauman, K. (2024). Put myself in your shoes: Lifting the egocentric perspective from exocentric videos. In ECCV.","DOI":"10.1007\/978-3-031-72920-1_23"},{"key":"2557_CR112","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N. F., Pons-Moll, G., & Black, M. J. (2019). Amass: Archive of motion capture as surface shapes. In The IEEE International Conference on Computer Vision (ICCV). https:\/\/amass.is.tue.mpg.de.","DOI":"10.1109\/ICCV.2019.00554"},{"key":"2557_CR113","doi-asserted-by":"crossref","unstructured":"Mavroudi, E., Afouras, T., & Torresani, L. (2022). Learning to ground instructional articles in videos through narrations.","DOI":"10.1109\/ICCV51070.2023.01395"},{"key":"2557_CR114","unstructured":"Mehta, S., & Rastegari, M. (2021). Mobilevit: light-weight, general-purpose, and mobile-friendly vision transformer. arXiv preprint arXiv:2110.02178."},{"key":"2557_CR115","doi-asserted-by":"crossref","unstructured":"Meng, Y., Lin, C.-C., Panda, R., Sattigeri, P., Karlinsky, L., Oliva, A., Saenko, K., & Feris, R. (2020). Ar-net: Adaptive frame resolution for efficient action recognition. In: ECCV 2020.","DOI":"10.1007\/978-3-030-58571-6_6"},{"key":"2557_CR116","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.-B., Smaira, L., Laptev, I., Sivic, J., & Zisserman, A. (2020). End-to-end learning of visual representations from uncurated instructional videos. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9879\u20139889.","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"2557_CR117","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.-B., Tapaswi, M., Laptev, I., & Sivic, J. (2019). HowTo100M: Learning a Text-Video Embedding by Watching Hundred Million Narrated Video Clips. In ICCV.","DOI":"10.1109\/ICCV.2019.00272"},{"key":"2557_CR118","unstructured":"MMPoseContributors: OpenMMLab Pose Estimation Toolbox and Benchmark. https:\/\/github.com\/open-mmlab\/mmpose (2020)."},{"key":"2557_CR119","doi-asserted-by":"crossref","unstructured":"Monfort, M., Andonian, A., Zhou, B., Ramakrishnan, K., Bargal, S. A., Yan, T., Brown, L., Fan, Q., Gutfreund, D., Vondrick, C., & Oliva, A. (2019). Moments in time dataset: one million videos for event understanding. PAMI.","DOI":"10.1109\/TPAMI.2019.2901464"},{"key":"2557_CR120","doi-asserted-by":"crossref","unstructured":"Moon, G., Yu, S.-I., Wen, H., Shiratori, T., & Lee, K. M. (2020). Interhand2. 6m: A dataset and baseline for 3d interacting hand pose estimation from a single rgb image. In Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XX 16, pp. 548\u2013564. Springer.","DOI":"10.1007\/978-3-030-58565-5_33"},{"key":"2557_CR121","unstructured":"Narasimhan, M., Yu, L., Bell, S., Zhang, N., & Darrell, T. (2023). Learning and verification of task structure in instructional videos. arXiv preprint arXiv:2303.13519."},{"key":"2557_CR122","doi-asserted-by":"crossref","unstructured":"Newcombe, N. (1989). The development of spatial perspective taking. Advances in child development and behavior.","DOI":"10.1016\/S0065-2407(08)60415-2"},{"key":"2557_CR123","doi-asserted-by":"crossref","unstructured":"Northcutt, C. G., Zha, S., Lovegrove, S., & Newcombe, R. (2023). Egocom: A multi-person multi-modal egocentric communications dataset. PAMI.","DOI":"10.1109\/TPAMI.2020.3025105"},{"key":"2557_CR124","doi-asserted-by":"crossref","unstructured":"Ohkawa, T., He, K., Sener, F., Hodan, T., Tran, L., & Keskin, C. (2023). Assemblyhands: Towards egocentric activity understanding via 3d hand pose estimation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12999\u201313008.","DOI":"10.1109\/CVPR52729.2023.01249"},{"key":"2557_CR125","unstructured":"Oord, A.V.D., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748."},{"key":"2557_CR126","doi-asserted-by":"crossref","unstructured":"Pan, B., Cai, H., Huang, D.-A., Lee, K.-H., Gaidon, A., Adeli, E., & Niebles, J. C. (2020). Spatio-temporal graph for video captioning with knowledge distillation. In: CVPR.","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"2557_CR127","doi-asserted-by":"crossref","unstructured":"Park, J., Oh, Y., Moon, G., Choi, H., & Lee, K. M. (2022). Handoccnet: Occlusion-robust 3d hand mesh estimation network. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1496\u20131505.","DOI":"10.1109\/CVPR52688.2022.00155"},{"key":"2557_CR128","doi-asserted-by":"crossref","unstructured":"Parmar, P., & Morris, B. T. (2017). Learning To Score Olympic Events.","DOI":"10.1109\/CVPRW.2017.16"},{"key":"2557_CR129","doi-asserted-by":"crossref","unstructured":"Parmar, P., & Morris, B. (2019). Action quality assessment across multiple actions. In: WACV.","DOI":"10.1109\/WACV.2019.00161"},{"key":"2557_CR130","doi-asserted-by":"crossref","unstructured":"Parmar, P., & Tran Morris, B. (2019). What and how well you performed? a multitask learning approach to action quality assessment. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 304\u2013313.","DOI":"10.1109\/CVPR.2019.00039"},{"key":"2557_CR131","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Shan, D., Radosavovic, I., Kanazawa, A., Fouhey, D., & Malik, J. (2024). Reconstructing hands in 3d with transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9826\u20139836.","DOI":"10.1109\/CVPR52733.2024.00938"},{"key":"2557_CR132","doi-asserted-by":"crossref","unstructured":"Peebles, W., & Xie, S. (2022). Scalable diffusion models with transformers. arXiv preprint arXiv:2212.09748.","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"2557_CR133","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van\u00a0Gool, L., Gross, M., & Sorkine-Hornung, A. (2016). A benchmark dataset and evaluation methodology for video object segmentation. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 724\u2013732.","DOI":"10.1109\/CVPR.2016.85"},{"key":"2557_CR134","doi-asserted-by":"crossref","unstructured":"Perez, E., Strub, F., De Vries, H., Dumoulin, V., & Courville, A. (2018). Film: Visual reasoning with a general conditioning layer. In Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32.","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"2557_CR135","doi-asserted-by":"crossref","unstructured":"Pirsiavash, H., & Ramanan, D. (2012). Detecting activities of daily living in first-person camera views. In CVPR.","DOI":"10.1109\/CVPR.2012.6248010"},{"key":"2557_CR136","doi-asserted-by":"crossref","unstructured":"Pirsiavash, H., Vondrick, C., & Torralba, A. (2014). Assessing the quality of actions. In ECCV.","DOI":"10.1007\/978-3-319-10599-4_36"},{"issue":"11","key":"2557_CR137","doi-asserted-by":"publisher","first-page":"4880","DOI":"10.1007\/s11263-024-02095-7","volume":"132","author":"C Plizzari","year":"2024","unstructured":"Plizzari, C., Goletto, G., Furnari, A., Bansal, S., Ragusa, F., Farinella, G. M., Damen, D., & Tommasi, T. (2024). An outlook into the future of egocentric vision. International Journal of Computer Vision., 132(11), 4880\u20134936.","journal-title":"International Journal of Computer Vision."},{"key":"2557_CR138","unstructured":"Polino, A., Pascanu, R., & Alistarh, D. (2018). Model compression via distillation and quantization. arXiv preprint arXiv:1802.05668."},{"key":"2557_CR139","unstructured":"Povey, D., Ghoshal, A., Boulianne, G., Burget, L., Glembek, O., Goel, N. K., Hannemann, M., Motl\u00edcek, P., Qian, Y., Schwarz, P., Silovsk\u00fd, J., Stemmer, G., & Vesel\u00fd, K. (2011). The kaldi speech recognition toolkit. https:\/\/api.semanticscholar.org\/CorpusID:1774023."},{"key":"2557_CR140","doi-asserted-by":"crossref","unstructured":"Pramanick, S., Song, Y., Nag, S., Lin, K. Q., Shah, H., Shou, M. Z., Chellappa, R., & Zhang, P. (2023). Egovlpv2: Egocentric video-language pre-training with fusion in the backbone. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5285\u20135297.","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"2557_CR141","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., & Krueger, G. (2021). Learning transferable visual models from natural language supervision. In International Conference on Machine Learning, pp. 8748\u20138763. PMLR."},{"key":"2557_CR142","unstructured":"Radford, A., Kim, J. W., Xu, T., Brockman, G., McLeavey, C., & Sutskever, I. (2023). Robust speech recognition via large-scale weak supervision. In International Conference on Machine Learning, pp. 28492\u201328518. PMLR."},{"key":"2557_CR143","doi-asserted-by":"crossref","unstructured":"Ragusa, F., Furnari, A., Livatino, S., & Farinella, G. M. (2021). The meccano dataset: Understanding human-object interactions from egocentric videos in an industrial-like domain. In: WACV.","DOI":"10.1109\/WACV48630.2021.00161"},{"key":"2557_CR144","doi-asserted-by":"crossref","unstructured":"Rai, N., Chen, H., Ji, J., Desai, R., Kozuka, K., Ishizaka, S., Adeli, E., Niebles, J. C. (2021). Home action genome: Contrastive compositional action understanding. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01103"},{"key":"2557_CR145","unstructured":"Ramakrishnan, S. K., Gokaslan, A., Wijmans, E., Maksymets, O., Clegg, A., Turner, J. M., Undersander, E., Galuba, W., Westbury, A., Chang, A. X., Savva, M., Zhao, Y., & Batra, D. (2021). Habitat-matterport 3d dataset (HM3d): 1000 large-scale 3d environments for embodied AI. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track. arXiv:2109.08238."},{"key":"2557_CR146","unstructured":"Reed, S., Lee, H., Anguelov, D., Szegedy, C., Erhan, D., & Rabinovich, A. (2014). Training deep neural networks on noisy labels with bootstrapping. arXiv preprint arXiv:1412.6596."},{"key":"2557_CR147","doi-asserted-by":"crossref","unstructured":"Regmi, K., & Borji, A. (2018). Cross-view image synthesis using conditional gans. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2018.00369"},{"key":"2557_CR148","doi-asserted-by":"crossref","unstructured":"Regmi, K., & Shah, M. (2019). Bridging the domain gap for ground-to-aerial image matching. In ICCV.","DOI":"10.1109\/ICCV.2019.00056"},{"key":"2557_CR149","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2019.07.008","author":"K Regmi","year":"2019","unstructured":"Regmi, K., & Borji, A. (2019). Cross-view image synthesis using geometry-guided conditional gans. Computer Vision and Image Understanding. https:\/\/doi.org\/10.1016\/j.cviu.2019.07.008","journal-title":"Computer Vision and Image Understanding"},{"key":"2557_CR150","doi-asserted-by":"crossref","unstructured":"Reizenstein, J., Shapovalov, R., Henzler, P., Sbordone, L., Labatut, P., & Novotny, D. (2021). Common objects in 3d: Large-scale learning and evaluation of real-life 3d category reconstruction. In ICCV.","DOI":"10.1109\/ICCV48922.2021.01072"},{"key":"2557_CR151","doi-asserted-by":"crossref","unstructured":"Ren, X., & Wang, X. (2022). Look outside the room: Synthesizing a consistent long-term 3d scene video from a single image. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3563\u20133573.","DOI":"10.1109\/CVPR52688.2022.00355"},{"key":"2557_CR152","doi-asserted-by":"crossref","unstructured":"Ren, B., Tang, H., & Sebe, N. (2021). Cascaded cross mlp-mixer gans for cross-view image translation. arXiv preprint arXiv:2110.10183.","DOI":"10.5244\/C.35.40"},{"issue":"6","key":"2557_CR153","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2980179.2980235","volume":"35","author":"H Rhodin","year":"2016","unstructured":"Rhodin, H., Richardt, C., Casas, D., Insafutdinov, E., Shafiei, M., Seidel, H.-P., Schiele, B., & Theobalt, C. (2016). Egocap: egocentric marker-less motion capture with two fisheye cameras. ACM Transactions on Graphics (TOG), 35(6), 1\u201311.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"2557_CR154","doi-asserted-by":"crossref","unstructured":"Rombach, R., Esser, P., & Ommer, B. (2021). Geometry-free view synthesis: Transformers and no 3d priors. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14356\u201314366.","DOI":"10.1109\/ICCV48922.2021.01409"},{"key":"2557_CR155","doi-asserted-by":"crossref","unstructured":"Romero, J., Tzionas, D., & Black, M. J. (2017). Embodied hands: Modeling and capturing hands and bodies together. ACM Transactions on Graphics, (Proc. SIGGRAPH Asia) 36(6).","DOI":"10.1145\/3130800.3130883"},{"key":"2557_CR156","doi-asserted-by":"crossref","unstructured":"Seminara, L., Farinella, G. M., & Furnari, A. (2024). Differentiable Task Graph Learning: Procedural Activity Representation and Online Mistake Detection from Egocentric Videos.","DOI":"10.52202\/079017-1895"},{"key":"2557_CR157","doi-asserted-by":"crossref","unstructured":"Sener, F., Chatterjee, D., Shelepov, D., He, K., Singhania, D., Wang, R., & Yao, A. (2022). Assembly101: A large-scale multi-view video dataset for understanding procedural activities. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21096\u201321106.","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"2557_CR158","doi-asserted-by":"crossref","unstructured":"Sermanet, P., Lynch, C., Chebotar, Y., Hsu, J., Jang, E., Schaal, S., & Levine, S. (2018). Time-contrastive networks: Self-supervised learning from video. Proceedings of International Conference in Robotics and Automation (ICRA).","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"2557_CR159","doi-asserted-by":"crossref","unstructured":"Sermanet, P., Lynch, C., Hsu, J., & Levine, S. (2017). Time-contrastive networks: Self-supervised learning from multi-view observation. In 2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), pp. 486\u2013487. IEEE.","DOI":"10.1109\/CVPRW.2017.69"},{"key":"2557_CR160","doi-asserted-by":"crossref","unstructured":"Shen, X., Efros, A. A., Joulin, A., & Aubry, M. (2022). Learning co-segmentation by segment swapping for retrieval and discovery. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5082\u20135092.","DOI":"10.1109\/CVPRW56347.2022.00556"},{"key":"2557_CR161","doi-asserted-by":"crossref","unstructured":"Shvetsova, N., Chen, B., Rouditchenko, A., Thomas, S., Kingsbury, B., Feris, R. S., Harwath, D., Glass, J., & Kuehne, H. (2022). Everything at once-multi-modal fusion transformer for video retrieval. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20020\u201320029.","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"2557_CR162","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G. A., Gupta, A., Schmid, C., Farhadi, A., & Alahari, K. (2018). Actor and observer: Joint modeling of first and third-person videos. In CVPR.","DOI":"10.1109\/CVPR.2018.00772"},{"key":"2557_CR163","unstructured":"Sigurdsson, G. A., Gupta, A., Schmid, C., Farhadi, A., & Alahari, K. (2018). Charades-ego: A large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626."},{"key":"2557_CR164","doi-asserted-by":"crossref","unstructured":"Simon, T., Joo, H., Matthews, I., & Sheikh, Y. (2017). Hand keypoint detection in single images using multiview bootstrapping. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1145\u20131153.","DOI":"10.1109\/CVPR.2017.494"},{"key":"2557_CR165","doi-asserted-by":"crossref","unstructured":"Singh, K. K., Fatahalian, K., & Efros, A. A. (2016). Krishnacam: Using a longitudinal, single-person, egocentric dataset for scene understanding tasks. In WACV.","DOI":"10.1109\/WACV.2016.7477717"},{"key":"2557_CR166","unstructured":"Song, Y., Byrne, E., Nagarajan, T., Wang, H., Martin, M., & Torresani, L. (2023). Ego4d goal-step: Toward hierarchical understanding of procedural activities. In NeurIPS."},{"key":"2557_CR167","unstructured":"Soomro, K., Zamir, A. R., & Shah, M. (2012). Ucf101: A dataset of 101 human action classes from videos in the wild. In CRCV-TR-12-01."},{"key":"2557_CR168","doi-asserted-by":"crossref","unstructured":"Soran, B., Farhadi, A., & Shapiro, L. (2015). Generating notifications for missing actions: Don\u2019t forget to turn the lights off! In ICCV, pp. 4669\u20134677.","DOI":"10.1109\/ICCV.2015.530"},{"key":"2557_CR169","unstructured":"Straub, J., Whelan, T., Ma, L., Chen, Y., Wijmans, E., Green, S., Engel, J.J., Mur-Artal, R., Ren, C., Verma, S., Clarkson, A., Yan, M., Budge, B., Yan, Y., Pan, X., Yon, J., Zou, Y., Leon, K., Carter, N., Briales, J., & Gillingham, T., et\u00a0al. (2019). The Replica dataset: A digital replica of indoor spaces. arXiv preprint arXiv:1906.05797."},{"key":"2557_CR170","doi-asserted-by":"crossref","unstructured":"Sudre, C. H., Li, W., Vercauteren, T., Ourselin, S., & Jorge Cardoso, M. (2017). Generalised dice overlap as a deep learning loss function for highly unbalanced segmentations. In Deep Learning in Medical Image Analysis and Multimodal Learning for Clinical Decision Support: Third International Workshop, DLMIA 2017, and 7th International Workshop, ML-CDS 2017, Held in Conjunction with MICCAI 2017, Qu\u00e9bec City, QC, Canada, September 14, Proceedings 3, pp. 240\u2013248. Springer.","DOI":"10.1007\/978-3-319-67558-9_28"},{"key":"2557_CR171","doi-asserted-by":"crossref","unstructured":"Sze, V., Chen, Y.-H., Yang, T.-J., & Emer, J. S. (2020). How to evaluate deep neural network processors: Tops\/w (alone) considered harmful. IEEE Solid-State Circuits Magazine.","DOI":"10.1109\/MSSC.2020.3002140"},{"key":"2557_CR172","doi-asserted-by":"crossref","unstructured":"Taheri, O., Ghorbani, N., Black, M. J., & Tzionas, D. (2020). GRAB: A dataset of whole-body human grasping of objects. In European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-030-58548-8_34"},{"key":"2557_CR173","doi-asserted-by":"crossref","unstructured":"Tan, M., Chen, B., Pang, R., Vasudevan, V., Sandler, M., Howard, A., & Le, Q. V. (2019). Mnasnet: Platform-aware neural architecture search for mobile. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2820\u20132828.","DOI":"10.1109\/CVPR.2019.00293"},{"key":"2557_CR174","unstructured":"Tan, S., Nagarajan, T., & Grauman, K. (2023). Egodistill: Egocentric head motion distillation for efficient video understanding. NeurIPS."},{"key":"2557_CR175","unstructured":"Tang, H., Liang, K., Grauman, K., Feiszli, M., & Wang, W. (2023). Egotracks: A long-term egocentric visual object tracking dataset. Advances in Neural Information Processing Systems."},{"key":"2557_CR176","doi-asserted-by":"crossref","unstructured":"Tang, Y., Lu, J., & Zhou, J. (2020). Comprehensive instructional video analysis: The coin dataset and performance evaluation. IEEE transactions on pattern analysis and machine intelligence.","DOI":"10.1109\/TPAMI.2020.2980824"},{"key":"2557_CR177","doi-asserted-by":"crossref","unstructured":"Tang, H., Xu, D., Sebe, N., Wang, Y., Corso, J. J., & Yan, Y. (2019). Multi-channel attention selection gan with cascaded semantic guidance for cross-view image translation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2417\u20132426.","DOI":"10.1109\/CVPR.2019.00252"},{"key":"2557_CR178","unstructured":"Teed, Z., & Deng, J. (2021). DROID-SLAM: Deep Visual SLAM for Monocular, Stereo, and RGB-D Cameras. Advances in neural information processing systems."},{"key":"2557_CR179","doi-asserted-by":"crossref","unstructured":"Tendulkar, P., Sur\u00eds, D., & Vondrick, C. (2023). Flex: Full-body grasping without full-body grasps. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR52729.2023.02029"},{"key":"2557_CR180","doi-asserted-by":"crossref","unstructured":"Tome, D., Peluse, P., Agapito, L., & Badino, H. (2019). xr-egopose: Egocentric 3d human pose from an hmd camera. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2019.00782"},{"key":"2557_CR181","unstructured":"Tong, Z., Song, Y., Wang, J., & Wang, L. (2022). Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. arXiv:2203.12602."},{"key":"2557_CR182","unstructured":"Torre, F. D., Hodgins, J., Montano, J., Valcarcel, S., Forcada, R., & Macey, J. (2009). Guide to the carnegie mellon university multimodal activity (cmu-mmac) database. In Tech. Report CMU-RI-TR-08-22, Robotics Institute, Carnegie Mellon University."},{"key":"2557_CR183","unstructured":"Tschernezki, V., Darkhalil, A., Zhu, Z., Fouhey, D., Larina, I., Larlus, D., Damen, D., & Vedaldi, A. (2023). EPIC Fields: Marrying 3D Geometry and Video Understanding. In Proceedings of the Neural Information Processing Systems (NeurIPS)."},{"key":"2557_CR184","doi-asserted-by":"crossref","unstructured":"Tseng, H.-Y., Li, Q., Kim, C., Alsisan, S., Huang, J.-B., & Kopf, J. (2023). Consistent view synthesis with pose-guided diffusion models. arXiv preprint arXiv:2303.17598.","DOI":"10.1109\/CVPR52729.2023.01609"},{"key":"2557_CR185","unstructured":"Varma, M., Wang, P., Chen, X., Chen, T., Venugopalan, S., & Wang, Z. (2023). Is attention all that neRF needs? In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=xE-LtsE-xx."},{"key":"2557_CR186","doi-asserted-by":"crossref","unstructured":"Vasu, P. K. A., Gabriel, J., Zhu, J., Tuzel, O., & Ranjan, A. (2023). Mobileone: An improved one millisecond mobile backbone. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7907\u20137917.","DOI":"10.1109\/CVPR52729.2023.00764"},{"key":"2557_CR187","doi-asserted-by":"crossref","unstructured":"Vicente, S., Rother, C., & Kolmogorov, V. (2011). Object cosegmentation. In CVPR 2011, pp. 2217\u20132224. IEEE.","DOI":"10.1109\/CVPR.2011.5995530"},{"key":"2557_CR188","doi-asserted-by":"crossref","unstructured":"Wang, X., Kwon, T., Rad, M., Pan, B., Chakraborty, I., Andrist, S., Bohus, D., Feniello, A., Tekin, B., Frujeri, F. V., & Joshi, N. (2023). Holoassist: an egocentric human interaction dataset for interactive ai assistants in the real world. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20270\u201320281.","DOI":"10.1109\/ICCV51070.2023.01854"},{"key":"2557_CR189","doi-asserted-by":"crossref","unstructured":"Wang, H., Wu, Y., Guo, S., & Wang, L. (2023). Pdpp: Projected diffusion for procedure planning in instructional videos. arXiv preprint arXiv:2303.14676.","DOI":"10.1109\/CVPR52729.2023.01425"},{"key":"2557_CR190","unstructured":"Watson, D., Chan, W., Martin-Brualla, R., Ho, J., Tagliasacchi, A., & Norouzi, M. (2022). Novel view synthesis with diffusion models. arXiv preprint arXiv:2210.04628."},{"key":"2557_CR191","doi-asserted-by":"crossref","unstructured":"Weinland, D., Ronfard, R., & Boyer, E. (2006). Free viewpoint action recognition using motion history volumes. Computer Vision and Image Understanding (CVIU).","DOI":"10.1016\/j.cviu.2006.07.013"},{"key":"2557_CR192","doi-asserted-by":"crossref","unstructured":"Wen, Y., Singh, K. K., Anderson, M., Jan, W.-P., & Lee, Y. J. (2021). Seeing the unseen: Predicting the first-person camera wearer\u2019s location and pose in third-person scenes. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) Workshops, pp. 3446\u20133455.","DOI":"10.1109\/ICCVW54120.2021.00384"},{"key":"2557_CR193","doi-asserted-by":"crossref","unstructured":"Wiles, O., Gkioxari, G., Szeliski, R., & Johnson, J. (2020). Synsin: End-to-end view synthesis from a single image. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7467\u20137477.","DOI":"10.1109\/CVPR42600.2020.00749"},{"key":"2557_CR194","doi-asserted-by":"crossref","unstructured":"Wong, B., Chen, J., Wu, Y., Lei, S. W., Mao, D., Gao, D., & Shou, M. Z. (2022). Assistq: Affordance-centric question-driven task completion for egocentric assistant. In: European Conference on Computer Vision.","DOI":"10.1007\/978-3-031-20059-5_28"},{"key":"2557_CR195","doi-asserted-by":"crossref","unstructured":"Wu, Z., Nagarajan, T., Kumar, A., Rennie, S., Davis, L. S., Grauman, K., & Feris, R. (2018). Blockdrop: Dynamic inference paths in residual networks. In CVPR.","DOI":"10.1109\/CVPR.2018.00919"},{"key":"2557_CR196","unstructured":"Wu, Z., Song, S., Khosla, A., Yu, F., Zhang, L., Tang, X., & Xiao, J. (2015). 3d shapenets: A deep representation for volumetric shapes. In Computer Vision and Pattern Recognition, IEEE Conference On."},{"key":"2557_CR197","doi-asserted-by":"crossref","unstructured":"Xia, F., Zamir, A. R., He, Z.-Y., Sax, A., Malik, J., & Savarese, S. (2018). Gibson Env: real-world perception for embodied agents. In CVPR. IEEE. Gibson license is available at http:\/\/svl.stanford.edu\/gibson2\/assets\/GDS_agreement.pdf","DOI":"10.1109\/CVPR.2018.00945"},{"key":"2557_CR198","doi-asserted-by":"crossref","unstructured":"Xiao, J., Owens, A., & Torralba, A. (2013). Sun3d: A database of big spaces reconstructed using sfm and object labels. In ICCV.","DOI":"10.1109\/ICCV.2013.458"},{"key":"2557_CR199","doi-asserted-by":"crossref","unstructured":"Xu, M., Fan, C., Wang, Y., Ryoo, M. S., & Crandall, D. J. (2018). Joint person segmentation and identification in synchronized first-and third-person videos. In ECCV.","DOI":"10.1007\/978-3-030-01246-5_39"},{"key":"2557_CR200","doi-asserted-by":"crossref","unstructured":"Xu, H., Ghosh, G., Huang, P.-Y., Okhonko, D., Aghajanyan, A., Metze, F., Zettlemoyer, L., & Feichtenhofer, C. (2021). Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084.","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"2557_CR201","doi-asserted-by":"crossref","unstructured":"Xu, F. F., Ji, L., Shi, B., Du, J., Neubig, G., Bisk, Y., & Duan, N. (2020). A benchmark for structured procedural knowledge extraction from cooking videos. arXiv preprint arXiv:2005.00706.","DOI":"10.18653\/v1\/2020.nlpbt-1.4"},{"issue":"5","key":"2557_CR202","doi-asserted-by":"publisher","first-page":"2093","DOI":"10.1109\/TVCG.2019.2898650","volume":"25","author":"W Xu","year":"2019","unstructured":"Xu, W., Chatterjee, A., Zollhoefer, M., Rhodin, H., Fua, P., Seidel, H.-P., & Theobalt, C. (2019). Mo 2 cap 2: Real-time mobile 3d motion capture with a cap-mounted fisheye camera. IEEE transactions on visualization and computer graphics, 25(5), 2093\u20132101.","journal-title":"IEEE transactions on visualization and computer graphics"},{"key":"2557_CR203","unstructured":"Xue, Z., & Grauman, K. (2023). Learning fine-grained view-invariant representations from unpaired ego-exo videos via temporal alignment. In NeurIPS."},{"key":"2557_CR204","first-page":"1086","volume":"34","author":"M Xu","year":"2021","unstructured":"Xu, M., Xiong, Y., Chen, H., Li, X., Xia, W., Tu, Z., & Soatto, S. (2021). Long short-term transformer for online action detection. Advances in Neural Information Processing Systems, 34, 1086\u20131099.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2557_CR205","doi-asserted-by":"crossref","unstructured":"Yang, J., Liu, S., Guo, H., Dong, Y., Zhang, X., Zhang, S., Wang, P., Zhou, Z., Xie, B., Wang, Z., Ouyang, B., Lin, Z., Cominelli, M., Cai, Z., Zhang, Y., Zhang, P., Hong, F., Widmer, J., Gringoli, F., Yang, L., & Li, B. et\u00a0al. (2025). Egolife: Towards egocentric life assistant. In CVPR.","DOI":"10.1109\/CVPR52734.2025.02690"},{"key":"2557_CR206","doi-asserted-by":"crossref","unstructured":"Yang, L., Radway, R. M., Chen, Y.-H., Wu, T. F., Liu, H., Ansari, E., Chandra, V., Mitra, S., Beign\u00e9, E. (2022). Three-dimensional stacked neural network accelerator architectures for ar\/vr applications. IEEE Micro.","DOI":"10.1109\/MM.2022.3202254"},{"key":"2557_CR207","doi-asserted-by":"crossref","unstructured":"Yu, H., Cai, M., Liu, Y., & Lu, F. (2019). What i see is what you see: Joint attention learning for first and third person video co-analysis. In ACM MM.","DOI":"10.1145\/3343031.3350896"},{"key":"2557_CR208","unstructured":"Yu, H., Cai, M., Liu, Y., & Lu, F. (2020). First-and third-person video co-analysis by learning spatial-temporal joint attention. IEEE Transactions on Pattern Analysis and Machine Intelligence."},{"key":"2557_CR209","doi-asserted-by":"crossref","unstructured":"Yu, X., Rao, Y., Zhao, W., Lu, J., & Zhou, J. (2021). Group-aware contrastive regression for action quality assessment. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7919\u20137928.","DOI":"10.1109\/ICCV48922.2021.00782"},{"key":"2557_CR210","doi-asserted-by":"crossref","unstructured":"Yuan, Y., & Kitani, K. (2018). 3d ego-pose estimation via imitation learning. In: Proceedings of the European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-030-01270-0_45"},{"key":"2557_CR211","doi-asserted-by":"crossref","unstructured":"Yuan, Y., & Kitani, K. (2019). Ego-pose estimation and forecasting as real-time pd control. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2019.01018"},{"key":"2557_CR212","doi-asserted-by":"crossref","unstructured":"Zhan, X., Yang, L., Zhao, Y., Mao, K., Xu, H., Lin, Z., Li, K., & Lu, C. (2024). Oakink2: A dataset of bimanual hands-object manipulation in complex task completion. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 445\u2013456.","DOI":"10.1109\/CVPR52733.2024.00050"},{"key":"2557_CR213","doi-asserted-by":"crossref","unstructured":"Zhang, Q., & Li, B. (2013). Relative hidden markov models for evaluating motion skill. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2013.77"},{"key":"2557_CR214","doi-asserted-by":"crossref","unstructured":"Zhang, S., Dai, W., Wang, S., Shen, X., Lu, J., Zhou, J., & Tang, Y. (2023). Logo: A long-form video dataset for group action quality assessment. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00238"},{"key":"2557_CR215","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A. A., Shechtman, E., & Wang, O. (2018). The unreasonable effectiveness of deep features as a perceptual metric. In CVPR.","DOI":"10.1109\/CVPR.2018.00068"},{"key":"2557_CR216","doi-asserted-by":"crossref","unstructured":"Zhang, S., Ma, Q., Zhang, Y., Qian, Z., Kwon, T., Pollefeys, M., Bogo, F., & Tang, S. (2022). Egobody: Human body shape and motion of interacting people from head-mounted devices. In ECCV.","DOI":"10.1007\/978-3-031-20068-7_11"},{"key":"2557_CR217","doi-asserted-by":"crossref","unstructured":"Zhang, C., Wu, J., & Li, Y. (2022). Actionformer: Localizing moments of actions with transformers. arXiv preprint arXiv:2202.07925.","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"2557_CR218","first-page":"492","volume":"13664","author":"C-L Zhang","year":"2022","unstructured":"Zhang, C.-L., Wu, J., & Li, Y. (2022). Actionformer: Localizing moments of actions with transformers. European Conference on Computer Vision. LNCS, 13664, 492\u2013510.","journal-title":"European Conference on Computer Vision. LNCS"},{"key":"2557_CR219","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhou, X., Lin, M., & Sun, J. (2018). Shufflenet: An extremely efficient convolutional neural network for mobile devices. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6848\u20136856.","DOI":"10.1109\/CVPR.2018.00716"},{"key":"2557_CR220","doi-asserted-by":"crossref","unstructured":"Zhao, Y., & Kr\u00e4henb\u00fchl, P. (2022). Real-time online video detection with temporal smoothing transformers. In European Conference on Computer Vision.","DOI":"10.1007\/978-3-031-19830-4_28"},{"key":"2557_CR221","doi-asserted-by":"crossref","unstructured":"Zhao, H., Hadji, I., Dvornik, N., Derpanis, K. G., Wildes, R. P., & Jepson, A. D. (2022). P3iv: Probabilistic procedure planning from instructional videos with weak supervision. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2938\u20132948.","DOI":"10.1109\/CVPR52688.2022.00295"},{"key":"2557_CR222","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Misra, I., Kr\u00e4henb\u00fchl, P., & Girdhar, R. (2023). Learning video representations from large language models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6586\u20136597.","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"2557_CR223","doi-asserted-by":"crossref","unstructured":"Zhao, W., Wang, W., & Tian, Y. (2022). Graformer: Graph-oriented transformer for 3d pose estimation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20438\u201320447.","DOI":"10.1109\/CVPR52688.2022.01979"},{"key":"2557_CR224","doi-asserted-by":"crossref","unstructured":"Zheng, C., Liu, X., Qi, G.-J., & Chen, C. (2023). Potter: Pooling attention transformer for efficient human mesh recovery. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1611\u20131620.","DOI":"10.1109\/CVPR52729.2023.00161"},{"key":"2557_CR225","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yu, L., Bai, Y., Li, S., Yan, X., & Li, Y. (2023). Learning procedure-aware video representation from instructional videos and their narrations. arXiv preprint arXiv:2303.17839.","DOI":"10.1109\/CVPR52729.2023.01424"},{"key":"2557_CR226","unstructured":"Zhou, L., Louis, N., & Corso, J. (2018). Weakly-supervised video object grounding from text by loss weighting and object interaction. In BMVC."},{"key":"2557_CR227","doi-asserted-by":"crossref","unstructured":"Zhou, H., Mart\u00edn-Mart\u00edn, R., Kapadia, M., Savarese, S., & Niebles, J. C. (2023). Procedure-aware pretraining for instructional video understanding. In CVPR, pp. 10727\u201310738.","DOI":"10.1109\/CVPR52729.2023.01033"},{"key":"2557_CR228","doi-asserted-by":"crossref","unstructured":"Zhou, H., Martin-Martin, R., Kapadia, M., Savarese, S., & Niebles, J. C. (2023). Procedure-aware pretraining for instructional video understanding. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR52729.2023.01033"},{"key":"2557_CR229","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., & Corso, J. J. (2018). Towards automatic learning of procedures from web instructional videos. In AAAI.","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"2557_CR230","unstructured":"Zhu, M., & Gupta, S. (2017). To prune, or not to prune: exploring the efficacy of pruning for model compression. arXiv preprint arXiv:1710.01878."},{"key":"2557_CR231","doi-asserted-by":"crossref","unstructured":"Zhukov, D., Alayrac, J.-B., Cinbis, R. G., Fouhey, D., Laptev, I., & Sivic, J. (2019). Cross-task weakly supervised learning from instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","DOI":"10.1109\/CVPR.2019.00365"},{"key":"2557_CR232","doi-asserted-by":"crossref","unstructured":"Zia, A., Sharma, Y., Bettadapura, V., Sarin, E. L., & Essa, I. A. (2017). Video and accelerometer-based motion analysis for automated surgical skills assessment. CoRR arXiv:1702.07772.","DOI":"10.1007\/s11548-018-1704-z"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02557-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02557-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02557-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T04:04:28Z","timestamp":1764993868000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02557-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,24]]},"references-count":232,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["2557"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02557-6","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,11,24]]},"assertion":[{"value":"25 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 July 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}