{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T21:08:35Z","timestamp":1776114515335,"version":"3.50.1"},"reference-count":155,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"11","license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1109\/tpami.2024.3381075","type":"journal-article","created":{"date-parts":[[2024,7,26]],"date-time":"2024-07-26T13:35:32Z","timestamp":1722000932000},"page":"9468-9509","source":"Crossref","is-referenced-by-count":6,"title":["Ego4D: Around the World in 3,600 Hours of Egocentric Video"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9591-5873","authenticated-orcid":false,"given":"Kristen","family":"Grauman","sequence":"first","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Andrew","family":"Westbury","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0133-6335","authenticated-orcid":false,"given":"Eugene","family":"Byrne","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2142-0992","authenticated-orcid":false,"given":"Vincent","family":"Cartillier","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Zachary","family":"Chavis","sequence":"additional","affiliation":[{"name":"University of Minnesota, Minneapolis, MN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6911-0302","authenticated-orcid":false,"given":"Antonino","family":"Furnari","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2393-6832","authenticated-orcid":false,"given":"Rohit","family":"Girdhar","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4083-9463","authenticated-orcid":false,"given":"Jackson","family":"Hamburger","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Hao","family":"Jiang","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5061-0740","authenticated-orcid":false,"given":"Devansh","family":"Kukreja","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6650-9972","authenticated-orcid":false,"given":"Miao","family":"Liu","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9736-3948","authenticated-orcid":false,"given":"Xingyu","family":"Liu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Miguel","family":"Martin","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1627-3842","authenticated-orcid":false,"given":"Tushar","family":"Nagarajan","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9200-5980","authenticated-orcid":false,"given":"Ilija","family":"Radosavovic","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2833-7038","authenticated-orcid":false,"given":"Santhosh Kumar","family":"Ramakrishnan","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Fiona","family":"Ryan","sequence":"additional","affiliation":[{"name":"Georgia Tech, Atlanta, GA, USA"}]},{"given":"Jayant","family":"Sharma","sequence":"additional","affiliation":[{"name":"University of Minnesota, Minneapolis, MN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5918-9029","authenticated-orcid":false,"given":"Michael","family":"Wray","sequence":"additional","affiliation":[{"name":"University of Bristol, Bristol, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9152-4632","authenticated-orcid":false,"given":"Mengmeng","family":"Xu","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]},{"given":"Eric Zhongcong","family":"Xu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4993-5416","authenticated-orcid":false,"given":"Chen","family":"Zhao","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2636-0066","authenticated-orcid":false,"given":"Siddhant","family":"Bansal","sequence":"additional","affiliation":[{"name":"University of Bristol, Bristol, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3540-1472","authenticated-orcid":false,"given":"Dhruv","family":"Batra","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Sean","family":"Crane","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Tien","family":"Do","sequence":"additional","affiliation":[{"name":"University of Minnesota, Minneapolis, MN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1773-7557","authenticated-orcid":false,"given":"Morrie","family":"Doulaty","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"given":"Akshay","family":"Erapalli","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9756-7238","authenticated-orcid":false,"given":"Christoph","family":"Feichtenhofer","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Adriano","family":"Fragomeni","sequence":"additional","affiliation":[{"name":"University of Bristol, Bristol, U.K."}]},{"given":"Qichen","family":"Fu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6212-2850","authenticated-orcid":false,"given":"Abrham","family":"Gebreselasie","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University Africa, Kigali, Rwanda"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9445-9952","authenticated-orcid":false,"given":"Cristina","family":"Gonz\u00e1lez","sequence":"additional","affiliation":[{"name":"Universidad de los Andes, Santiago, Chile"}]},{"given":"James","family":"Hillis","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"given":"Xuhua","family":"Huang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8067-6227","authenticated-orcid":false,"given":"Yifei","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Tokyo, Tokyo, Japan"}]},{"given":"Wenqi","family":"Jia","sequence":"additional","affiliation":[{"name":"Georgia Tech, Atlanta, GA, USA"}]},{"given":"Weslie","family":"Khoo","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}]},{"given":"J\u00e1chym","family":"Kol\u00e1\u0159","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0207-8680","authenticated-orcid":false,"given":"Satwik","family":"Kottur","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Anurag","family":"Kumar","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"given":"Federico","family":"Landini","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Chao","family":"Li","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Menlo Park, CA, USA"}]},{"given":"Yanghao","family":"Li","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7085-3813","authenticated-orcid":false,"given":"Zhenqiang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2169-1395","authenticated-orcid":false,"given":"Karttikeya","family":"Mangalam","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, CA, USA"}]},{"given":"Raghava","family":"Modhugu","sequence":"additional","affiliation":[{"name":"International Institute of Information Technology, Hyderabad, Hyderabad, Telangana, USA"}]},{"given":"Jonathan","family":"Munro","sequence":"additional","affiliation":[{"name":"University of Bristol, Bristol, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5592-8218","authenticated-orcid":false,"given":"Tullie","family":"Murrell","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Takumi","family":"Nishiyasu","sequence":"additional","affiliation":[{"name":"University of Tokyo, Tokyo, Japan"}]},{"given":"Will","family":"Price","sequence":"additional","affiliation":[{"name":"University of Bristol, Bristol, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4352-7999","authenticated-orcid":false,"given":"Paola Ruiz","family":"Puentes","sequence":"additional","affiliation":[{"name":"Universidad de los Andes, Santiago, Chile"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6234-0831","authenticated-orcid":false,"given":"Merey","family":"Ramazanova","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3754-1156","authenticated-orcid":false,"given":"Leda","family":"Sari","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8554-9083","authenticated-orcid":false,"given":"Kiran","family":"Somasundaram","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5064-5884","authenticated-orcid":false,"given":"Audrey","family":"Southerland","sequence":"additional","affiliation":[{"name":"Georgia Tech, Atlanta, GA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4206-710X","authenticated-orcid":false,"given":"Yusuke","family":"Sugano","sequence":"additional","affiliation":[{"name":"University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0021-5661","authenticated-orcid":false,"given":"Ruijie","family":"Tao","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}]},{"given":"Minh","family":"Vo","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1077-5294","authenticated-orcid":false,"given":"Yuchen","family":"Wang","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}]},{"given":"Xindi","family":"Wu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4050-6543","authenticated-orcid":false,"given":"Takuma","family":"Yagi","sequence":"additional","affiliation":[{"name":"University of Tokyo, Tokyo, Japan"}]},{"given":"Ziwei","family":"Zhao","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}]},{"given":"Yunyi","family":"Zhu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5244-2407","authenticated-orcid":false,"given":"Pablo","family":"Arbel\u00e1ez","sequence":"additional","affiliation":[{"name":"Universidad de los Andes, Santiago, Chile"}]},{"given":"David","family":"Crandall","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, IN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8804-6238","authenticated-orcid":false,"given":"Dima","family":"Damen","sequence":"additional","affiliation":[{"name":"University of Bristol, Bristol, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6034-0432","authenticated-orcid":false,"given":"Giovanni Maria","family":"Farinella","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}]},{"given":"Christian","family":"Fuegen","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5534-587X","authenticated-orcid":false,"given":"Bernard","family":"Ghanem","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology, Thuwal, Saudi Arabia"}]},{"given":"Vamsi Krishna","family":"Ithapu","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6767-7057","authenticated-orcid":false,"given":"C. V.","family":"Jawahar","sequence":"additional","affiliation":[{"name":"International Institute of Information Technology, Hyderabad, Hyderabad, Telangana, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6666-7460","authenticated-orcid":false,"given":"Hanbyul","family":"Joo","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9389-4060","authenticated-orcid":false,"given":"Kris","family":"Kitani","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9091-8989","authenticated-orcid":false,"given":"Richard","family":"Newcombe","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"given":"Aude","family":"Oliva","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6613-0738","authenticated-orcid":false,"given":"Hyun Soo","family":"Park","sequence":"additional","affiliation":[{"name":"University of Minnesota, Minneapolis, MN, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1793-5462","authenticated-orcid":false,"given":"James M.","family":"Rehg","sequence":"additional","affiliation":[{"name":"Georgia Tech, Atlanta, GA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0097-4537","authenticated-orcid":false,"given":"Yoichi","family":"Sato","sequence":"additional","affiliation":[{"name":"University of Tokyo, Tokyo, Japan"}]},{"given":"Jianbo","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Pennsylvania, Philadelphia, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7681-2166","authenticated-orcid":false,"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}]},{"given":"Antonio","family":"Torralba","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6924-7285","authenticated-orcid":false,"given":"Lorenzo","family":"Torresani","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]},{"given":"Mingfei","family":"Yan","sequence":"additional","affiliation":[{"name":"Meta, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3695-1580","authenticated-orcid":false,"given":"Jitendra","family":"Malik","sequence":"additional","affiliation":[{"name":"FAIR, Menlo Park, CA, USA"}]}],"member":"263","reference":[{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889052"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_13"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.5821\/dissertation-2117-94212"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.226"},{"key":"ref9","article-title":"Multiple object tracking performance metrics and evaluation in a smart room environment","volume-title":"Proc. IEEE 6th Int. Workshop Vis. Surveill. Conjunction ECCV","author":"Bernardin"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1155\/2008\/246309"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2017.XIII.012"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2740062"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10605-2_35"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/icassp40776.2020.9052974"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2016.XII.034"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2028"},{"key":"ref18","article-title":"Audio-visual embodied navigation","volume":"97","author":"Chen","year":"2019","journal-title":"Environment"},{"key":"ref19","article-title":"InternVideo-Ego4D: A pack of champion solutions to Ego4D challenges","author":"Chen","year":"2022"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-020-19712-x"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00544"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1064"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2337"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2991965"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01531-2"},{"key":"ref28","first-page":"753","article-title":"Scaling egocentric vision: The EPIC-KITCHENS dataset","volume-title":"Proc. Eur. Conf. Comput. Vis.","author":"Damen"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.5244\/c.28.30"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.02.016"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/THMS.2016.2623480"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref33","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref34","article-title":"EasyCom: An augmented reality dataset to support algorithms for easy communication in noisy environments","author":"Donley","year":"2021"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16215"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr42600.2020.00095"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01694-6"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01123"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247805"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.333"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2017.10.004"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2992889"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-776"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00041"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00398"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01524"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2648793"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01325"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01563"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"ref59","article-title":"spaCy: Industrial-strength natural language processing in Python","author":"Honnibal","year":"2020"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683142"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298744"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1155\/2007\/64506"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58574-7_46"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.373"},{"key":"ref65","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00559"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00701"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(01)00041-3"},{"key":"ref69","volume-title":"Nonverbal Communication in Human Interaction","author":"Knapp","year":"2014"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref71","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Krizhevsky"},{"key":"ref72","article-title":"Guide to the Carnegie Mellon University multimodal activity (CMU-MMAC) database","author":"la Torre","year":"2009"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2009.5354442"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247820"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0794-5"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/T-AFFC.2012.17"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2013.326"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.399"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3051319"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00687"},{"key":"ref82","article-title":"Egocentric video-language pretraining","author":"Lin","year":"2022"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00399"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00684"},{"key":"ref87","article-title":"VIP: Towards universal visual reward and representation via value-implicit pre-training","author":"Ma","year":"2022"},{"key":"ref88","article-title":"Estimating more camera poses for ego-centric videos is essential for VQ3D","author":"Mai","year":"2022"},{"key":"ref89","first-page":"651","article-title":"DexVIP: Learning dexterous grasping with human hand pose priors from video","volume-title":"Proc. Conf. Robot Learn.","author":"Mandikal"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00359"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0655-7"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.5244\/C.25.22"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/wacv56688.2023.00599"},{"key":"ref94","first-page":"137","article-title":"The AMI meeting corpus","volume-title":"Proc. 5th Int. Conf. Methods Techn. Behav. Res.","author":"McCowan"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref96","article-title":"Intel labs at Ego4D challenge 2022: A better baseline for audio-visual diarization","author":"Min","year":"2022"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.129"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.59"},{"key":"ref99","first-page":"360","article-title":"Self-supervised generation of spatial audio for 360${}^\\circ$\u2218 video","volume-title":"Proc. 32nd Int. Conf. Neural Inf. Process. Syst.","author":"Morgado"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00878"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_11"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-950"},{"key":"ref103","article-title":"R3M: A universal visual representation for robot manipulation","author":"Nair","year":"2022"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.721"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00354"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00991"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2303576"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3025105"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref110","article-title":"Automatic mutual gaze detection in face-to-face dyadic interaction videos","volume":"1","author":"Palmero","year":"2018","journal-title":"Measuring Behav."},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.508"},{"key":"ref112","first-page":"422","article-title":"3D social saliency from head-mounted cameras","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Park"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248010"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00369"},{"key":"ref115","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref116","article-title":"Real-world robot learning with masked visual pre-training","author":"Radosavovic","year":"2022"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2019.12.016"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00161"},{"key":"ref119","first-page":"199","article-title":"Where are they looking?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Recasens"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/MPRV.2014.23"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103252"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.352"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2952095"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00989"},{"key":"ref129","article-title":"Charades-ego: A large-scale dataset of paired third and first person videos","author":"Sigurdsson","year":"2018"},{"key":"ref130","article-title":"Silero VAD: Pre-trained enterprise-grade voice activity detector (VAD), number detector and language classifier","year":"2021"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00253"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2016.7477717"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8545516"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_28"},{"key":"ref135","article-title":"Is someone speaking? Exploring long-term temporal features for audio-visual active speaker detection","author":"Tao","year":"2021"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"ref137","article-title":"Episodic and semantic memory","volume-title":"Organization of Memory","author":"Tulving","year":"1972"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.291"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00806"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3015894"},{"key":"ref142","article-title":"Audiovisual slowfast networks for video recognition","author":"Xiao","year":"2020"},{"key":"ref143","article-title":"Masked visual pre-training for motor control","author":"Xiao","year":"2022"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.00255"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.288"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12295"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_11"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01340"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"ref153","article-title":"Exploring state change capture of heterogeneous backbones @ Ego4D hands and objects challenge 2022","author":"Zheng","year":"2022"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.511"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_16"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-021-1293-0"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11192800\/10611736.pdf?arnumber=10611736","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,6]],"date-time":"2025-10-06T17:36:35Z","timestamp":1759772195000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10611736\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11]]},"references-count":155,"journal-issue":{"issue":"11"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3381075","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11]]}}}