{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T01:54:04Z","timestamp":1770342844322,"version":"3.49.0"},"publisher-location":"Cham","reference-count":74,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729195","type":"print"},{"value":"9783031729201","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72920-1_23","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T08:02:57Z","timestamp":1727683377000},"page":"407-425","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Put Myself in\u00a0Your Shoes: Lifting the\u00a0Egocentric Perspective from\u00a0Exocentric Videos"],"prefix":"10.1007","author":[{"given":"Mi","family":"Luo","sequence":"first","affiliation":[]},{"given":"Zihui","family":"Xue","sequence":"additional","affiliation":[]},{"given":"Alex","family":"Dimakis","sequence":"additional","affiliation":[]},{"given":"Kristen","family":"Grauman","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"23_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"253","DOI":"10.1007\/978-3-319-46454-1_16","volume-title":"Computer Vision \u2013 ECCV 2016","author":"S Ardeshir","year":"2016","unstructured":"Ardeshir, S., Borji, A.: Ego2Top: matching viewers in egocentric and\u00a0top-view videos. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 253\u2013268. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_16"},{"issue":"6","key":"23_CR2","doi-asserted-by":"publisher","first-page":"1353","DOI":"10.1109\/TPAMI.2018.2832121","volume":"41","author":"S Ardeshir","year":"2018","unstructured":"Ardeshir, S., Borji, A.: Egocentric meets top-view. IEEE Trans. Pattern Anal. Mach. Intell. 41(6), 1353\u20131366 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"23_CR3","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1016\/j.cviu.2018.05.005","volume":"171","author":"S Ardeshir","year":"2018","unstructured":"Ardeshir, S., Borji, A.: An exocentric look at egocentric actions and vice versa. Comput. Vis. Image Underst. 171, 61\u201368 (2018)","journal-title":"Comput. Vis. Image Underst."},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Bahl, S., Gupta, A., Pathak, D.: Human-to-robot imitation in the wild. arXiv preprint arXiv:2207.09450 (2022)","DOI":"10.15607\/RSS.2022.XVIII.026"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Bansal, A., Ma, S., Ramanan, D., Sheikh, Y.: Recycle-gan: unsupervised video retargeting. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 119\u2013135 (2018)","DOI":"10.1007\/978-3-030-01228-1_8"},{"key":"23_CR6","unstructured":"Bharadhwaj, H., Gupta, A., Tulsiani, S., Kumar, V.: Zero-shot robot manipulation from passive human videos. arXiv preprint arXiv:2302.02011 (2023)"},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Calli, B., Singh, A., Walsman, A., Srinivasa, S., Abbeel, P., Dollar, A.M.: The ycb object and model set: towards common benchmarks for manipulation research. In: 2015 International Conference on Advanced Robotics (ICAR), pp. 510\u2013517. IEEE (2015)","DOI":"10.1109\/ICAR.2015.7251504"},{"key":"23_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-End object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Chan, E.R., et al.: Generative novel view synthesis with 3d-aware diffusion models. arXiv preprint arXiv:2304.02602 (2023)","DOI":"10.1109\/ICCV51070.2023.00389"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Cheng, F., et al.: 4diff: 3d-aware diffusion model for third-to-first viewpoint translation. In: ECCV (2024)","DOI":"10.1007\/978-3-031-72691-0_23"},{"key":"23_CR11","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR12","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"23_CR13","unstructured":"Elfeki, M., Regmi, K., Ardeshir, S., Borji, A.: From third person to first person: dataset and baselines for synthesis and retrieval. arXiv preprint arXiv:1812.00104 (2018)"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Fan, C., et al.: Identifying first-person camera wearers in third-person videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5125\u20135133 (2017)","DOI":"10.1109\/CVPR.2017.503"},{"key":"23_CR15","unstructured":"Grauman, K., et\u00a0al.: Ego-exo4d: understanding skilled human activity from first-and third-person perspectives. arXiv preprint arXiv:2311.18259 (2023)"},{"key":"23_CR16","unstructured":"Grauman, K., et\u00a0al.: Ego-exo4d: understanding skilled human activity from first-and third-person perspectives. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19383\u201319400 (2024)"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Ho, H.I., Chiu, W.C., Wang, Y.C.F.: Summarizing first-person videos from third persons\u2019 points of view. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 70\u201385 (2018)","DOI":"10.1007\/978-3-030-01267-0_5"},{"key":"23_CR18","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"1","key":"23_CR19","first-page":"2249","volume":"23","author":"J Ho","year":"2022","unstructured":"Ho, J., Saharia, C., Chan, W., Fleet, D.J., Norouzi, M., Salimans, T.: Cascaded diffusion models for high fidelity image generation. J. Mach. Learn. Res. 23(1), 2249\u20132281 (2022)","journal-title":"J. Mach. Learn. Res."},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Hore, A., Ziou, D.: Image quality metrics: PSNR vs. SSIM. In: 2010 20th International Conference on Pattern Recognition, pp. 2366\u20132369. IEEE (2010)","DOI":"10.1109\/ICPR.2010.579"},{"key":"23_CR21","unstructured":"Iandola, F.N., Han, S., Moskewicz, M.W., Ashraf, K., Dally, W.J., Keutzer, K.: Squeezenet: alexnet-level accuracy with 50x fewer parameters and$$<$$ 0.5 mb model size. arXiv preprint arXiv:1602.07360 (2016)"},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.Y., Zhou, T., Efros, A.A.: Image-to-image translation with conditional adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1125\u20131134 (2017)","DOI":"10.1109\/CVPR.2017.632"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Jang, W., Agapito, L.: Codenerf: disentangled neural radiance fields for object categories. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12949\u201312958 (2021)","DOI":"10.1109\/ICCV48922.2021.01271"},{"issue":"2","key":"23_CR24","doi-asserted-by":"publisher","first-page":"3046","DOI":"10.1109\/LRA.2022.3144512","volume":"7","author":"R Jangir","year":"2022","unstructured":"Jangir, R., Hansen, N., Ghosal, S., Jain, M., Wang, X.: Look closer: bridging egocentric and third-person views with transformers for robotic manipulation. IEEE Rob. Autom. Lett. 7(2), 3046\u20133053 (2022)","journal-title":"IEEE Rob. Autom. Lett."},{"key":"23_CR25","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"issue":"6","key":"23_CR26","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Commun. ACM 60(6), 84\u201390 (2017)","journal-title":"Commun. ACM"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Kwon, T., Tekin, B., St\u00fchmer, J., Bogo, F., Pollefeys, M.: H2o: Two hands manipulating objects for first person interaction recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10138\u201310148 (2021)","DOI":"10.1109\/ICCV48922.2021.00998"},{"key":"23_CR28","doi-asserted-by":"crossref","unstructured":"Li, Y., Nagarajan, T., Xiong, B., Grauman, K.: Ego-exo: transferring visual representations from third-person to first-person videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6943\u20136953 (2021)","DOI":"10.1109\/CVPR46437.2021.00687"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Liu, A., Tucker, R., Jampani, V., Makadia, A., Snavely, N., Kanazawa, A.: Infinite nature: perpetual view generation of natural scenes from a single image. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14458\u201314467 (2021)","DOI":"10.1109\/ICCV48922.2021.01419"},{"key":"23_CR30","doi-asserted-by":"crossref","unstructured":"Liu, G., Tang, H., Latapie, H., Yan, Y.: Exocentric to egocentric image generation via parallel generative adversarial network. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1843\u20131847. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053957"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Liu, G., Tang, H., Latapie, H.M., Corso, J.J., Yan, Y.: Cross-view exocentric to egocentric video synthesis. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 974\u2013982 (2021)","DOI":"10.1145\/3474085.3475596"},{"key":"23_CR32","unstructured":"Lv, Z., et al.: Aria pilot dataset (2022). https:\/\/about.facebook.com\/realitylabs\/projectaria\/datasets"},{"key":"23_CR33","unstructured":"Majumdar, A., et\u00a0al.: Where are we in the search for an artificial visual cortex for embodied intelligence? arXiv preprint arXiv:2303.18240 (2023)"},{"key":"23_CR34","unstructured":"Mandikal, P., Grauman, K.: Dexvip: learning dexterous grasping with human hand pose priors from video. In: Conference on Robot Learning (2021)"},{"key":"23_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1007\/978-3-030-58452-8_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Mildenhall","year":"2020","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 405\u2013421. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_24"},{"key":"23_CR36","unstructured":"Nair, S., Rajeswaran, A., Kumar, V., Finn, C., Gupta, A.: R3m: a universal visual representation for robot manipulation. arXiv preprint arXiv:2203.12601 (2022)"},{"key":"23_CR37","doi-asserted-by":"crossref","unstructured":"Niemeyer, M., Barron, J.T., Mildenhall, B., Sajjadi, M.S., Geiger, A., Radwan, N.: Regnerf: regularizing neural radiance fields for view synthesis from sparse inputs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5480\u20135490 (2022)","DOI":"10.1109\/CVPR52688.2022.00540"},{"key":"23_CR38","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. arXiv preprint arXiv:2212.09748 (2022)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"23_CR39","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: unified, real-time object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 779\u2013788 (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Regmi, K., Borji, A.: Cross-view image synthesis using conditional gans. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00369"},{"key":"23_CR41","doi-asserted-by":"publisher","unstructured":"Regmi, K., Borji, A.: Cross-view image synthesis using geometry-guided conditional gans. In: Computer Vision and Image Understanding (2019). https:\/\/doi.org\/10.1016\/j.cviu.2019.07.008. http:\/\/www.sciencedirect.com\/science\/article\/pii\/S1077314219301043","DOI":"10.1016\/j.cviu.2019.07.008"},{"key":"23_CR42","unstructured":"Ren, B., Tang, H., Sebe, N.: Cascaded cross mlp-mixer gans for cross-view image translation. arXiv preprint arXiv:2110.10183 (2021)"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Ren, X., Wang, X.: Look outside the room: synthesizing a consistent long-term 3d scene video from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3563\u20133573 (2022)","DOI":"10.1109\/CVPR52688.2022.00355"},{"key":"23_CR44","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"23_CR45","doi-asserted-by":"crossref","unstructured":"Rombach, R., Esser, P., Ommer, B.: Geometry-free view synthesis: transformers and no 3d priors. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14356\u201314366 (2021)","DOI":"10.1109\/ICCV48922.2021.01409"},{"key":"23_CR46","unstructured":"Seitzer, M.: pytorch-fid: FID Score for PyTorch (2020). https:\/\/github.com\/mseitzer\/pytorch-fid. version 0.3.0"},{"key":"23_CR47","doi-asserted-by":"crossref","unstructured":"Sener, F., et al.: Assembly101: a large-scale multi-view video dataset for understanding procedural activities. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21096\u201321106 (2022)","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"23_CR48","doi-asserted-by":"crossref","unstructured":"Sermanet, P., et al.: Time-contrastive networks: self-supervised learning from video. Proceedings of International Conference in Robotics and Automation (ICRA) (2018). http:\/\/arxiv.org\/abs\/1704.06888","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"23_CR49","doi-asserted-by":"crossref","unstructured":"Shan, D., Geng, J., Shu, M., Fouhey, D.F.: Understanding human hands in contact at internet scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00989"},{"key":"23_CR50","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Gupta, A., Schmid, C., Farhadi, A., Alahari, K.: Actor and observer: joint modeling of first and third-person videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7396\u20137404 (2018)","DOI":"10.1109\/CVPR.2018.00772"},{"key":"23_CR51","unstructured":"Sigurdsson, G.A., Gupta, A., Schmid, C., Farhadi, A., Alahari, K.: Charades-ego: a large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626 (2018)"},{"key":"23_CR52","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"23_CR53","unstructured":"Sitzmann, V., Zollh\u00f6fer, M., Wetzstein, G.: Scene representation networks: continuous 3d-structure-aware neural scene representations. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"23_CR54","unstructured":"Somasundaram, K., et al.: Project aria: a new tool for egocentric multi-modal AI research. arXiv preprint arXiv:2308.13561 (2023)"},{"key":"23_CR55","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=PxTIG12RRHS"},{"key":"23_CR56","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"178","DOI":"10.1007\/978-3-319-16814-2_12","volume-title":"Computer Vision \u2013 ACCV 2014","author":"B Soran","year":"2015","unstructured":"Soran, B., Farhadi, A., Shapiro, L.: Action recognition in the presence of one egocentric and multiple static cameras. In: Cremers, D., Reid, I., Saito, H., Yang, M.-H. (eds.) ACCV 2014. LNCS, vol. 9007, pp. 178\u2013193. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-16814-2_12"},{"key":"23_CR57","doi-asserted-by":"crossref","unstructured":"Tang, H., Xu, D., Sebe, N., Wang, Y., Corso, J.J., Yan, Y.: Multi-channel attention selection gan with cascaded semantic guidance for cross-view image translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2417\u20132426 (2019)","DOI":"10.1109\/CVPR.2019.00252"},{"key":"23_CR58","doi-asserted-by":"crossref","unstructured":"Tseng, H.Y., Li, Q., Kim, C., Alsisan, S., Huang, J.B., Kopf, J.: Consistent view synthesis with pose-guided diffusion models. arXiv preprint arXiv:2303.17598 (2023)","DOI":"10.1109\/CVPR52729.2023.01609"},{"key":"23_CR59","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"23_CR60","doi-asserted-by":"crossref","unstructured":"Wang, J., Liu, L., Xu, W., Sarkar, K., Luvizon, D., Theobalt, C.: Estimating egocentric 3d human pose in the wild with external weak supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13157\u201313166 (2022)","DOI":"10.1109\/CVPR52688.2022.01281"},{"key":"23_CR61","unstructured":"Wang, T.C., et al.: Video-to-video synthesis. arXiv preprint arXiv:1808.06601 (2018)"},{"key":"23_CR62","doi-asserted-by":"crossref","unstructured":"Wang, T.C., Liu, M.Y., Zhu, J.Y., Tao, A., Kautz, J., Catanzaro, B.: High-resolution image synthesis and semantic manipulation with conditional gans. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8798\u20138807 (2018)","DOI":"10.1109\/CVPR.2018.00917"},{"key":"23_CR63","unstructured":"Wang, Z., Wu, S., Xie, W., Chen, M., Prisacariu, V.A.: NeRF$$--$$: neural radiance fields without known camera parameters. arXiv preprint arXiv:2102.07064 (2021)"},{"key":"23_CR64","unstructured":"Watson, D., Chan, W., Martin-Brualla, R., Ho, J., Tagliasacchi, A., Norouzi, M.: Novel view synthesis with diffusion models. arXiv preprint arXiv:2210.04628 (2022)"},{"key":"23_CR65","doi-asserted-by":"crossref","unstructured":"Wen, Y., Singh, K.K., Anderson, M., Jan, W.P., Lee, Y.J.: Seeing the unseen: predicting the first-person camera wearer\u2019s location and pose in third-person scenes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) Workshops, pp. 3446\u20133455 (2021)","DOI":"10.1109\/ICCVW54120.2021.00384"},{"key":"23_CR66","doi-asserted-by":"crossref","unstructured":"Wiles, O., Gkioxari, G., Szeliski, R., Johnson, J.: Synsin: end-to-end view synthesis from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7467\u20137477 (2020)","DOI":"10.1109\/CVPR42600.2020.00749"},{"key":"23_CR67","doi-asserted-by":"crossref","unstructured":"Xu, M., Fan, C., Wang, Y., Ryoo, M.S., Crandall, D.J.: Joint person segmentation and identification in synchronized first-and third-person videos. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 637\u2013652 (2018)","DOI":"10.1007\/978-3-030-01246-5_39"},{"key":"23_CR68","unstructured":"Xue, Z., Grauman, K.: Learning fine-grained view-invariant representations from unpaired ego-exo videos via temporal alignment. In: NeurIPS (2023)"},{"key":"23_CR69","doi-asserted-by":"crossref","unstructured":"Ye, Y., et al.: Affordance diffusion: synthesizing hand-object interactions. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02153"},{"key":"23_CR70","doi-asserted-by":"crossref","unstructured":"Yu, A., Ye, V., Tancik, M., Kanazawa, A.: pixelnerf: neural radiance fields from one or few images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4578\u20134587 (2021)","DOI":"10.1109\/CVPR46437.2021.00455"},{"key":"23_CR71","doi-asserted-by":"crossref","unstructured":"Yu, H., Cai, M., Liu, Y., Lu, F.: What i see is what you see: joint attention learning for first and third person video co-analysis. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 1358\u20131366 (2019)","DOI":"10.1145\/3343031.3350896"},{"key":"23_CR72","doi-asserted-by":"publisher","first-page":"6631","DOI":"10.1109\/TPAMI.2020.3030048","volume":"45","author":"H Yu","year":"2020","unstructured":"Yu, H., Cai, M., Liu, Y., Lu, F.: First-and third-person video co-analysis by learning spatial-temporal joint attention. IEEE Trans. Pattern Anal. Mach. Intell. 45, 6631\u20136646 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"23_CR73","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"23_CR74","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00068"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72920-1_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:54:21Z","timestamp":1732830861000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72920-1_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031729195","9783031729201"],"references-count":74,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72920-1_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}