{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:47:57Z","timestamp":1777657677023,"version":"3.51.4"},"publisher-location":"Cham","reference-count":53,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732195","type":"print"},{"value":"9783031732201","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73220-1_15","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T20:05:00Z","timestamp":1730577900000},"page":"253-270","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Synchronization Is All You Need: Exocentric-to-Egocentric Transfer for\u00a0Temporal Action Segmentation with\u00a0Unlabeled Synchronized Video Pairs"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4999-8698","authenticated-orcid":false,"given":"Camillo","family":"Quattrocchi","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6911-0302","authenticated-orcid":false,"given":"Antonino","family":"Furnari","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4286-2050","authenticated-orcid":false,"given":"Daniele","family":"Di Mauro","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5232-677X","authenticated-orcid":false,"given":"Mario Valerio","family":"Giuffrida","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6034-0432","authenticated-orcid":false,"given":"Giovanni Maria","family":"Farinella","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Camporese, G., Coscia, P., Furnari, A., Farinella, G.M., Ballan, L.: Knowledge distillation for action anticipation via label smoothing. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 3312\u20133319. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9412660"},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"Chen, M.H., Kira, Z., AlRegib, G., Yoo, J., Chen, R., Zheng, J.: Temporal attentive alignment for large-scale video domain adaptation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00642"},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Chen, M.H., Li, B., Bao, Y., AlRegib, G.: Action segmentation with mixed temporal domain adaptation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 605\u2013614 (2020)","DOI":"10.1109\/WACV45572.2020.9093535"},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Long video-based action segmentation for earthmoving excavators using improved temporal convolutional network models. In: IOP Conference Series: Earth and Environmental Science, vol.\u00a01101, p. 092003. IOP Publishing (2022)","DOI":"10.1088\/1755-1315\/1101\/9\/092003"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Choi, J., Sharma, G., Chandraker, M., Huang, J.B.: Unsupervised and semi-supervised domain adaptation for action recognition from drones. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1717\u20131726 (2020)","DOI":"10.1109\/WACV45572.2020.9093511"},{"key":"15_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"678","DOI":"10.1007\/978-3-030-58610-2_40","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Choi","year":"2020","unstructured":"Choi, J., Sharma, G., Schulter, S., Huang, J.-B.: Shuffle and attend: video domain adaptation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12357, pp. 678\u2013695. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58610-2_40"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Crasto, N., Weinzaepfel, P., Alahari, K., Schmid, C.: Mars: motion-augmented rgb stream for action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7882\u20137891 (2019)","DOI":"10.1109\/CVPR.2019.00807"},{"key":"15_CR8","unstructured":"Csurka, G.: Domain adaptation for visual applications: a comprehensive survey. CoRR arxiv:1702.05374 (2017)"},{"key":"15_CR9","unstructured":"Damen, D., et al.: Rescaling egocentric vision. CoRR arxiv:2006.13256 (2020)"},{"key":"15_CR10","unstructured":"Ding, G., Sener, F., Yao, A.: Temporal action segmentation: an analysis of modern technique. arXiv preprint arXiv:2210.10352 (2022)"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Dwibedi, D., Aytar, Y., Tompson, J., Sermanet, P., Zisserman, A.: Temporal cycle-consistency learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1801\u20131810 (2019)","DOI":"10.1109\/CVPR.2019.00190"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"Fernando, B., Herath, S.: Anticipating human actions by correlating past with the future with jaccard similarity measures. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13224\u201313233 (2021)","DOI":"10.1109\/CVPR46437.2021.01302"},{"key":"15_CR13","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103763","volume":"234","author":"A Furnari","year":"2023","unstructured":"Furnari, A., Farinella, G.M.: Streaming egocentric action anticipation: an evaluation scheme and approach. Comput. Vis. Image Underst. 234, 103763 (2023)","journal-title":"Comput. Vis. Image Underst."},{"key":"15_CR14","unstructured":"Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International Conference on Machine Learning, pp. 1180\u20131189. PMLR (2015)"},{"key":"15_CR15","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Singh, M., Ravi, N., van\u00a0der Maaten, L., Joulin, A., Misra, I.: Omnivore: a single model for many visual Modalities. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01563"},{"key":"15_CR16","unstructured":"Grauman, K., et\u00a0al.: Ego-exo4d: understanding skilled human activity from first-and third-person perspectives. In: CVPR (2024)"},{"issue":"1","key":"15_CR17","first-page":"723","volume":"13","author":"A Gretton","year":"2012","unstructured":"Gretton, A., Borgwardt, K.M., Rasch, M.J., Sch\u00f6lkopf, B., Smola, A.: A kernel two-sample test. J. Mach. Learn. Res. 13(1), 723\u2013773 (2012)","journal-title":"J. Mach. Learn. Res."},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Hadji, I., Derpanis, K.G., Jepson, A.D.: Representation learning via global temporal alignment and cycle-consistency. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11068\u201311077 (2021)","DOI":"10.1109\/CVPR46437.2021.01092"},{"key":"15_CR19","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"15_CR20","unstructured":"Huang, Z., Wang, N.: Like what you like: knowledge distill via neuron selectivity transfer (2017)"},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Kim, D., et al.: Learning cross-modal contrastive features for video domain adaptation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13618\u201313627 (2021)","DOI":"10.1109\/ICCV48922.2021.01336"},{"key":"15_CR22","doi-asserted-by":"crossref","unstructured":"Kim, D., et al.: Learning cross-modal contrastive features for video domain adaptation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 13618\u201313627 (October 2021)","DOI":"10.1109\/ICCV48922.2021.01336"},{"issue":"4","key":"15_CR23","doi-asserted-by":"publisher","first-page":"765","DOI":"10.1109\/TPAMI.2018.2884469","volume":"42","author":"H Kuehne","year":"2020","unstructured":"Kuehne, H., Richard, A., Gall, J.: A hybrid rnn-hmm approach for weakly supervised temporal action segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 42(4), 765\u2013779 (2020). https:\/\/doi.org\/10.1109\/TPAMI.2018.2884469","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Lea, C., Flynn, M.D., Vidal, R., Reiter, A., Hager, G.D.: Temporal convolutional networks for action segmentation and detection. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 156\u2013165 (2017)","DOI":"10.1109\/CVPR.2017.113"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Li, G., Jampani, V., Sun, D., Sevilla-Lara, L.: Locate: localize and transfer object parts for weakly supervised affordance grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10922\u201310931 (2023)","DOI":"10.1109\/CVPR52729.2023.01051"},{"key":"15_CR26","unstructured":"Li, S.J., AbuFarha, Y., Liu, Y., Cheng, M.M., Gall, J.: Ms-tcn++: multi-stage temporal convolutional network for action segmentation. IEEE Trans. Pattern Anal. Mach. Intell. (2020)"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Li, Y., Nagarajan, T., Xiong, B., Grauman, K.: Ego-exo: transferring visual representations from third-person to first-person videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6943\u20136953 (2021)","DOI":"10.1109\/CVPR46437.2021.00687"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Li, Y., Liu, M., Rehg, J.M.: In the eye of beholder: joint learning of gaze and actions in first person video. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 619\u2013635 (2018)","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., Han, S.: Tsm: temporal shift module for efficient video understanding. In: Proceedings of the IEEE International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00718"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Liu, D., Li, Q., Dinh, A., Jiang, T., Shah, M., Xu, C.: Diffusion action segmentation. arXiv preprint arXiv:2303.17959 (2023)","DOI":"10.1109\/ICCV51070.2023.00930"},{"key":"15_CR31","unstructured":"Liu, M., Chen, X., Zhang, Y., Li, Y., Rehg, J.M.: Attention distillation for learning video representations. arXiv preprint arXiv:1904.03249 (2019)"},{"key":"15_CR32","doi-asserted-by":"publisher","first-page":"7774","DOI":"10.1109\/TCSVT.2023.3281671","volume":"33","author":"X Liu","year":"2023","unstructured":"Liu, X., Zhou, S., Lei, T., Jiang, P., Chen, Z., Lu, H.: First-person video domain adaptation with multi-scene cross-site datasets and attention-based methods. IEEE Trans. Circ. Syst. Video Technol. 33, 7774\u20137788 (2023). https:\/\/doi.org\/10.1109\/TCSVT.2023.3281671","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"15_CR33","doi-asserted-by":"crossref","unstructured":"Munro, J., Damen, D.: Multi-modal domain adaptation for fine-grained action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 122\u2013132 (2020)","DOI":"10.1109\/CVPR42600.2020.00020"},{"key":"15_CR34","unstructured":"Oquab, M., et\u00a0al.: Dinov2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"15_CR35","doi-asserted-by":"crossref","unstructured":"Passalis, N., Tzelepi, M., Tefas, A.: Heterogeneous knowledge distillation using information flow modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2339\u20132348 (2020)","DOI":"10.1109\/CVPR42600.2020.00241"},{"key":"15_CR36","unstructured":"Reza, S., Sundareshan, B., Moghaddam, M., Camps, O.: Enhancing transformer backbone for egocentric video action segmentation. arXiv preprint arXiv:2305.11365 (2023)"},{"key":"15_CR37","unstructured":"Romero, A., Ballas, N., Kahou, S.E., Chassang, A., Gatta, C., Bengio, Y.: Fitnets: hints for thin deep nets. arXiv preprint arXiv:1412.6550 (2014)"},{"key":"15_CR38","doi-asserted-by":"crossref","unstructured":"Sayed, S., Ghoddoosian, R., Trivedi, B., Athitsos, V.: A new dataset and approach for timestamp supervised action segmentation using human object interaction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3132\u20133141 (2023)","DOI":"10.1109\/CVPRW59228.2023.00315"},{"key":"15_CR39","doi-asserted-by":"crossref","unstructured":"Sener, F., et al.: Assembly101: a large-scale multi-view video dataset for understanding procedural activities. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21096\u201321106 (2022)","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"15_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/978-3-030-58517-4_10","volume-title":"Computer Vision \u2013 ECCV 2020","author":"F Sener","year":"2020","unstructured":"Sener, F., Singhania, D., Yao, A.: Temporal aggregate representations for long-range video understanding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12361, pp. 154\u2013171. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58517-4_10"},{"key":"15_CR41","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Gupta, A., Schmid, C., Farhadi, A., Alahari, K.: Actor and observer: joint modeling of first and third-person videos. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7396\u20137404 (2018)","DOI":"10.1109\/CVPR.2018.00772"},{"key":"15_CR42","unstructured":"Singhania, D., Rahaman, R., Yao, A.: Coarse to fine multi-resolution temporal convolutional network. arXiv preprint arXiv:2105.10859 (2021)"},{"key":"15_CR43","doi-asserted-by":"crossref","unstructured":"Singhania, D., Rahaman, R., Yao, A.: Iterative contrast-classify for semi-supervised temporal action segmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 2262\u20132270 (2022)","DOI":"10.1609\/aaai.v36i2.20124"},{"key":"15_CR44","doi-asserted-by":"publisher","unstructured":"Spriggs, E.H., De\u00a0La\u00a0Torre, F., Hebert, M.: Temporal segmentation and activity classification from first-person sensing. In: 2009 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops, pp. 17\u201324 (2009). https:\/\/doi.org\/10.1109\/CVPRW.2009.5204354","DOI":"10.1109\/CVPRW.2009.5204354"},{"key":"15_CR45","doi-asserted-by":"crossref","unstructured":"Stroud, J., Ross, D., Sun, C., Deng, J., Sukthankar, R.: D3d: distilled 3d networks for video action recognition. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 625\u2013634 (2020)","DOI":"10.1109\/WACV45572.2020.9093274"},{"key":"15_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1007\/978-3-319-49409-8_35","volume-title":"Computer Vision \u2013 ECCV 2016 Workshops","author":"B Sun","year":"2016","unstructured":"Sun, B., Saenko, K.: Deep CORAL: correlation alignment for deep domain adaptation. In: Hua, G., J\u00e9gou, H. (eds.) ECCV 2016. LNCS, vol. 9915, pp. 443\u2013450. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-49409-8_35"},{"key":"15_CR47","doi-asserted-by":"crossref","unstructured":"Tran, V., Wang, Y., Zhang, Z., Hoai, M.: Knowledge distillation for human action anticipation. In: 2021 IEEE International Conference on Image Processing (ICIP), pp. 2518\u20132522. IEEE (2021)","DOI":"10.1109\/ICIP42928.2021.9506693"},{"key":"15_CR48","doi-asserted-by":"crossref","unstructured":"Wang, X., Hu, J.F., Lai, J.H., Zhang, J., Zheng, W.S.: Progressive teacher-student learning for early action prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3556\u20133565 (2019)","DOI":"10.1109\/CVPR.2019.00367"},{"issue":"5","key":"15_CR49","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3400066","volume":"11","author":"G Wilson","year":"2020","unstructured":"Wilson, G., Cook, D.J.: A survey of unsupervised deep domain adaptation. ACM Trans. Intell. Syst. Technol. (TIST) 11(5), 1\u201346 (2020)","journal-title":"ACM Trans. Intell. Syst. Technol. (TIST)"},{"key":"15_CR50","first-page":"14890","volume":"35","author":"Z Xu","year":"2022","unstructured":"Xu, Z., Rawat, Y., Wong, Y., Kankanhalli, M.S., Shah, M.: Don\u2019t pour cereal into coffee: differentiable temporal logic for temporal action segmentation. Adv. Neural. Inf. Process. Syst. 35, 14890\u201314903 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR51","unstructured":"Xue, Z., Grauman, K.: Learning fine-grained view-invariant representations from unpaired ego-exo videos via temporal alignment. arXiv preprint arXiv:2306.05526 (2023)"},{"key":"15_CR52","doi-asserted-by":"crossref","unstructured":"Yim, J., Joo, D., Bae, J., Kim, J.: A gift from knowledge distillation: fast optimization, network minimization and transfer learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4133\u20134141 (2017)","DOI":"10.1109\/CVPR.2017.754"},{"key":"15_CR53","doi-asserted-by":"crossref","unstructured":"Yu, H., Cai, M., Liu, Y., Lu, F.: What i see is what you see: joint attention learning for first and third person video co-analysis. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 1358\u20131366 (2019)","DOI":"10.1145\/3343031.3350896"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73220-1_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T20:06:37Z","timestamp":1730577997000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73220-1_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031732195","9783031732201"],"references-count":53,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73220-1_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}