{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,18]],"date-time":"2026-02-18T01:50:06Z","timestamp":1771379406985,"version":"3.50.1"},"publisher-location":"Cham","reference-count":64,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729881","type":"print"},{"value":"9783031729898","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72989-8_1","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T17:02:04Z","timestamp":1729875724000},"page":"1-19","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Ex2Eg-MAE: A Framework for\u00a0Adaptation of\u00a0Exocentric Video Masked Autoencoders for\u00a0Egocentric Social Role Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2391-3563","authenticated-orcid":false,"given":"Minh","family":"Tran","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yelin","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Che-Chun","family":"Su","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng-Hao","family":"Kuo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Min","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5873-1434","authenticated-orcid":false,"given":"Mohammad","family":"Soleymani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Abu\u00a0Farha, Y., Richard, A., Gall, J.: When will you do what?-anticipating temporal occurrences of activities. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5343\u20135352 (2018)","DOI":"10.1109\/CVPR.2018.00560"},{"key":"1_CR2","doi-asserted-by":"crossref","unstructured":"Afouras, T., Asano, Y.M., Fagan, F., Vedaldi, A., Metze, F.: Self-supervised object detection from audio-visual correspondence. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10575\u201310586 (2022)","DOI":"10.1109\/CVPR52688.2022.01032"},{"key":"1_CR3","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1007\/s12193-012-0111-y","volume":"7","author":"X Alameda-Pineda","year":"2013","unstructured":"Alameda-Pineda, X., et al.: Ravel: an annotated corpus for training robots with audiovisual abilities. J. Multimodal User Interfaces 7, 79\u201391 (2013)","journal-title":"J. Multimodal User Interfaces"},{"key":"1_CR4","doi-asserted-by":"crossref","unstructured":"Alc\u00e1zar, J.L., Caba, F., Thabet, A.K., Ghanem, B.: Maas: multi-modal assignation for active speaker detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 265\u2013274 (2021)","DOI":"10.1109\/ICCV48922.2021.00033"},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"Athar, S., Shu, Z., Samaras, D.: Self-supervised deformation modeling for facial expression editing. In: 2020 15th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2020), pp. 294\u2013301. IEEE (2020)","DOI":"10.1109\/FG47880.2020.00115"},{"key":"1_CR6","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: bert pre-training of image transformers. In: International Conference on Learning Representations (2021)"},{"key":"1_CR7","doi-asserted-by":"crossref","unstructured":"Bhattarai, A.R., Nie\u00dfner, M., Sevastopolsky, A.: Triplanenet: an encoder for eg3d inversion. arXiv preprint arXiv:2303.13497 (2023)","DOI":"10.1109\/WACV57701.2024.00303"},{"issue":"2","key":"1_CR8","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1258\/135763307780096195","volume":"13","author":"P Boissy","year":"2007","unstructured":"Boissy, P., Corriveau, H., Michaud, F., Labont\u00e9, D., Royer, M.P.: A qualitative study of in-home robotic telepresence for home care of community-living elderly subjects. J. Telemed. Telecare 13(2), 79\u201384 (2007)","journal-title":"J. Telemed. Telecare"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Bulat, A., Tzimiropoulos, G.: How far are we from solving the 2d & 3d face alignment problem?(and a dataset of 230,000 3d facial landmarks). In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1021\u20131030 (2017)","DOI":"10.1109\/ICCV.2017.116"},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Cai, Z., et al.: Marlin: masked autoencoder for facial video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1493\u20131504 (2023)","DOI":"10.1109\/CVPR52729.2023.00150"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Carros, F., et al.: Exploring human-robot interaction with the elderly: results from a ten-week case study in a care home. In: Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems, pp. 1\u201312 (2020)","DOI":"10.1145\/3313831.3376402"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Chan, E.R., et\u00a0al.: Efficient geometry-aware 3d generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16123\u201316133 (2022)","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"1_CR13","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Chung, J., Nagrani, A., Zisserman, A.: Voxceleb2: deep speaker recognition. In: Interspeech 2018 (2018)","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Cruz, F., Parisi, G.I., Twiefel, J., Wermter, S.: Multi-modal integration of dynamic audiovisual patterns for an interactive reinforcement learning scenario. In: 2016 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 759\u2013766. IEEE (2016)","DOI":"10.1109\/IROS.2016.7759137"},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Damen, D., et\u00a0al.: Scaling egocentric vision: the epic-kitchens dataset. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 720\u2013736 (2018)","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"1_CR17","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1016\/j.cviu.2016.02.016","volume":"149","author":"D Damen","year":"2016","unstructured":"Damen, D., Leelasawassuk, T., Mayol-Cuevas, W.: You-do, i-learn: egocentric unsupervised discovery of objects and their modes of interaction towards video-based guidance. Comput. Vis. Image Underst. 149, 98\u2013112 (2016)","journal-title":"Comput. Vis. Image Underst."},{"key":"1_CR18","unstructured":"Donley, J., et al.: Easycom: an augmented reality dataset to support algorithms for easy communication in noisy environments. arXiv preprint arXiv:2107.04174 (2021)"},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Everingham, M., Sivic, J., Zisserman, A.: Hello! my name is... buffy\u201d\u2013automatic naming of characters in tv video. In: BMVC, vol.\u00a02, p.\u00a06 (2006)","DOI":"10.5244\/C.20.92"},{"key":"1_CR20","unstructured":"Fritsch, J., Kleinehagenbrock, M., Lang, S., Fink, G.A., Sagerer, G.: Audiovisual person tracking with a mobile robot. In: Proceedings of International Conference on Intelligent Autonomous Systems, pp. 898\u2013906 (2004)"},{"issue":"11","key":"1_CR21","doi-asserted-by":"publisher","first-page":"4021","DOI":"10.1109\/TPAMI.2020.2992889","volume":"43","author":"A Furnari","year":"2020","unstructured":"Furnari, A., Farinella, G.M.: Rolling-unrolling lstms for action anticipation from first-person video. IEEE Trans. Pattern Anal. Mach. Intell. 43(11), 4021\u20134036 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Gan, C., Zhang, Y., Wu, J., Gong, B., Tenenbaum, J.B.: Look, listen, and act: towards audio-visual embodied navigation. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 9701\u20139707. IEEE (2020)","DOI":"10.1109\/ICRA40945.2020.9197008"},{"key":"1_CR23","unstructured":"Grauman, K., et\u00a0al.: Ego4d: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"1_CR26","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1_CR27","doi-asserted-by":"crossref","unstructured":"Hempel, T., Abdelrahman, A.A., Al-Hamadi, A.: 6d rotation representation for unconstrained head pose estimation. In: 2022 IEEE International Conference on Image Processing (ICIP), pp. 2496\u20132500. IEEE (2022)","DOI":"10.1109\/ICIP46576.2022.9897219"},{"key":"1_CR28","doi-asserted-by":"crossref","unstructured":"Huang, C., Tian, Y., Kumar, A., Xu, C.: Egocentric audio-visual object localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22910\u201322921 (2023)","DOI":"10.1109\/CVPR52729.2023.02194"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Jiang, H., Grauman, K.: Seeing invisible poses: estimating 3d body pose from egocentric video. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3501\u20133509. IEEE (2017)","DOI":"10.1109\/CVPR.2017.373"},{"key":"1_CR30","doi-asserted-by":"crossref","unstructured":"Jiang, H., Murdock, C., Ithapu, V.K.: Egocentric deep multi-channel audio-visual active speaker localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10544\u201310552 (2022)","DOI":"10.1109\/CVPR52688.2022.01029"},{"key":"1_CR31","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"1_CR32","doi-asserted-by":"crossref","unstructured":"Kazakos, E., Nagrani, A., Zisserman, A., Damen, D.: Epic-fusion: audio-visual temporal binding for egocentric action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5492\u20135501 (2019)","DOI":"10.1109\/ICCV.2019.00559"},{"key":"1_CR33","doi-asserted-by":"crossref","unstructured":"Kellnhofer, P., Recasens, A., Stent, S., Matusik, W., Torralba, A.: Gaze360: physically unconstrained gaze estimation in the wild. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6912\u20136921 (2019)","DOI":"10.1109\/ICCV.2019.00701"},{"key":"1_CR34","doi-asserted-by":"publisher","first-page":"621","DOI":"10.1007\/978-3-031-19827-4_36","volume-title":"European Conference on Computer Vision","author":"D Kim","year":"2022","unstructured":"Kim, D., Wang, K., Sclaroff, S., Saenko, K.: A broad study of pre-training for domain generalization and adaptation. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision, vol. 13693, pp. 621\u2013638. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19827-4_36"},{"key":"1_CR35","doi-asserted-by":"crossref","unstructured":"Kim, S., Jeong, S., Kim, E., Kang, I., Kwak, N.: Self-supervised pre-training and contrastive representation learning for multiple-choice video qa. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 13171\u201313179 (2021)","DOI":"10.1609\/aaai.v35i14.17556"},{"key":"1_CR36","doi-asserted-by":"crossref","unstructured":"K\u00f6p\u00fckl\u00fc, O., Taseska, M., Rigoll, G.: How to design a three-stage architecture for audio-visual active speaker detection in the wild. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1193\u20131203 (2021)","DOI":"10.1109\/ICCV48922.2021.00123"},{"key":"1_CR37","doi-asserted-by":"crossref","unstructured":"Lee, Y.J., Ghosh, J., Grauman, K.: Discovering important people and objects for egocentric video summarization. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1346\u20131353. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6247820"},{"key":"1_CR38","doi-asserted-by":"crossref","unstructured":"Li, J., Liu, K., Wu, J.: Ego-body pose estimation via ego-head pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17142\u201317151 (2023)","DOI":"10.1109\/CVPR52729.2023.01644"},{"key":"1_CR39","unstructured":"Lin, H.C., Wang, C.Y., Chen, M.H., Fu, S.W., Wang, Y.C.F.: Quavf: quality-aware audio-visual fusion for ego4d talking to me challenge. arXiv preprint arXiv:2306.17404 (2023)"},{"key":"1_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"704","DOI":"10.1007\/978-3-030-58452-8_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Liu","year":"2020","unstructured":"Liu, M., Tang, S., Li, Y., Rehg, J.M.: Forecasting human-object interaction: joint prediction of motor attention and\u00a0actions in first person video. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 704\u2013721. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_41"},{"key":"1_CR41","doi-asserted-by":"publisher","first-page":"371","DOI":"10.1007\/978-3-031-19833-5_22","volume-title":"European Conference on Computer Vision","author":"K Min","year":"2022","unstructured":"Min, K., Roy, S., Tripathi, S., Guha, T., Majumdar, S.: Learning long-term spatial-temporal graphs for active speaker detection. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision, vol. 13695, pp. 371\u2013387. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_22"},{"key":"1_CR42","doi-asserted-by":"crossref","unstructured":"Nagarajan, T., Feichtenhofer, C., Grauman, K.: Grounded human-object interaction hotspots from video. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8688\u20138697 (2019)","DOI":"10.1109\/ICCV.2019.00878"},{"key":"1_CR43","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027","volume":"60","author":"A Nagrani","year":"2020","unstructured":"Nagrani, A., Chung, J.S., Xie, W., Zisserman, A.: Voxceleb: large-scale speaker verification in the wild. Comput. Speech Lang. 60, 101027 (2020)","journal-title":"Comput. Speech Lang."},{"key":"1_CR44","doi-asserted-by":"crossref","unstructured":"Ng, E., Xiang, D., Joo, H., Grauman, K.: You2me: inferring body pose in egocentric video via first and second person interactions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9890\u20139900 (2020)","DOI":"10.1109\/CVPR42600.2020.00991"},{"key":"1_CR45","unstructured":"Northcutt, C., Zha, S., Lovegrove, S., Newcombe, R.: Egocom: a multi-person multi-modal egocentric communications dataset. IEEE Trans. Pattern Anal. Mach. Intell. (2020)"},{"key":"1_CR46","doi-asserted-by":"crossref","unstructured":"Pramanick, S., et al.: Egovlpv2: egocentric video-language pre-training with fusion in the backbone. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5285\u20135297 (2023)","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"1_CR47","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"1_CR48","unstructured":"Radosavovic, I., Xiao, T., James, S., Abbeel, P., Malik, J., Darrell, T.: Real-world robot learning with masked visual pre-training. In: Conference on Robot Learning, pp. 416\u2013426. PMLR (2023)"},{"key":"1_CR49","doi-asserted-by":"crossref","unstructured":"Richardson, E., et al.: Encoding in style: a stylegan encoder for image-to-image translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2287\u20132296 (2021)","DOI":"10.1109\/CVPR46437.2021.00232"},{"issue":"1","key":"1_CR50","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3544777","volume":"42","author":"D Roich","year":"2022","unstructured":"Roich, D., Mokady, R., Bermano, A.H., Cohen-Or, D.: Pivotal tuning for latent-based editing of real images. ACM Trans. Graph. (TOG) 42(1), 1\u201313 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"1_CR51","doi-asserted-by":"crossref","unstructured":"Ryan, F., Jiang, H., Shukla, A., Rehg, J.M., Ithapu, V.K.: Egocentric auditory attention localization in conversations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14663\u201314674 (2023)","DOI":"10.1109\/CVPR52729.2023.01409"},{"key":"1_CR52","unstructured":"Sigurdsson, G.A., Gupta, A., Schmid, C., Farhadi, A., Alahari, K.: Charades-ego: a large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626 (2018)"},{"key":"1_CR53","unstructured":"Smilkov, D., Thorat, N., Kim, B., Vi\u00e9gas, F., Wattenberg, M.: Smoothgrad: removing noise by adding noise. arXiv preprint arXiv:1706.03825 (2017)"},{"key":"1_CR54","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"454","DOI":"10.1007\/978-3-319-46454-1_28","volume-title":"Computer Vision \u2013 ECCV 2016","author":"Y-C Su","year":"2016","unstructured":"Su, Y.-C., Grauman, K.: Detecting engagement in egocentric video. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 454\u2013471. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_28"},{"key":"1_CR55","doi-asserted-by":"crossref","unstructured":"Tao, R., Pan, Z., Das, R.K., Qian, X., Shou, M.Z., Li, H.: Is someone speaking? exploring long-term temporal features for audio-visual active speaker detection. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 3927\u20133935 (2021)","DOI":"10.1145\/3474085.3475587"},{"key":"1_CR56","first-page":"10078","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv. Neural. Inf. Process. Syst. 35, 10078\u201310093 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1_CR57","doi-asserted-by":"crossref","unstructured":"Wang, J., Liu, Y., Hu, Y., Shi, H., Mei, T.: Facex-zoo: a pytorch toolbox for face recognition. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 3779\u20133782 (2021)","DOI":"10.1145\/3474085.3478324"},{"key":"1_CR58","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Videomae v2: Scaling video masked autoencoders with dual masking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14549\u201314560 (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"1_CR59","doi-asserted-by":"crossref","unstructured":"Wolf, L., Hassner, T., Maoz, I.: Face recognition in unconstrained videos with matched background similarity. In: CVPR 2011, pp. 529\u2013534. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995566"},{"key":"1_CR60","doi-asserted-by":"crossref","unstructured":"Xue, Z., Song, Y., Grauman, K., Torresani, L.: Egocentric video task translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2310\u20132320 (2023)","DOI":"10.1109\/CVPR52729.2023.00229"},{"key":"1_CR61","doi-asserted-by":"crossref","unstructured":"Yang, C., Lamdouar, H., Lu, E., Zisserman, A., Xie, W.: Self-supervised video object segmentation by motion grouping. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7177\u20137188 (2021)","DOI":"10.1109\/ICCV48922.2021.00709"},{"key":"1_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Unicon: unified context network for robust active speaker detection. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 3964\u20133972 (2021)","DOI":"10.1145\/3474085.3475275"},{"key":"1_CR63","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Berg, T.L.: Temporal perception and prediction in ego-centric video. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4498\u20134506 (2015)","DOI":"10.1109\/ICCV.2015.511"},{"key":"1_CR64","doi-asserted-by":"crossref","unstructured":"Zhu, X., Lei, Z., Liu, X., Shi, H., Li, S.Z.: Face alignment across large poses: a 3d solution. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 146\u2013155 (2016)","DOI":"10.1109\/CVPR.2016.23"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72989-8_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T17:09:29Z","timestamp":1729876169000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72989-8_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031729881","9783031729898"],"references-count":64,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72989-8_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}