{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:30:57Z","timestamp":1778081457073,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":102,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100000002","name":"NIH (National Institutes of Health)","doi-asserted-by":"publisher","award":["R01EY034562"],"award-info":[{"award-number":["R01EY034562"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","award":["HR00112220003"],"award-info":[{"award-number":["HR00112220003"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681618","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"1682-1691","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["EAGLE: Egocentric AGgregated Language-video Engine"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8235-2158","authenticated-orcid":false,"given":"Jing","family":"Bi","sequence":"first","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2796-1787","authenticated-orcid":false,"given":"Yunlong","family":"Tang","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0126-1259","authenticated-orcid":false,"given":"Luchuan","family":"Song","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1014-2937","authenticated-orcid":false,"given":"Ali","family":"Vosoughi","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1553-3224","authenticated-orcid":false,"given":"Nguyen","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2183-822X","authenticated-orcid":false,"given":"Chenliang","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 5225--5237","author":"Peri","unstructured":"Peri Akiva et al. 2023. Self-Supervised Object Detection from Egocentric Videos. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 5225--5237."},{"key":"e_1_3_2_2_2_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Jinze Bai","year":"2023","unstructured":"Jinze Bai et al. 
2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00175"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_38"},{"key":"e_1_3_2_2_5_1","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"2","author":"Gedas Bertasius","year":"2021","unstructured":"Gedas Bertasius et al. 2021. Is space-time attention all you need for video understanding?. In ICML, Vol. 2. 4.","journal-title":"ICML"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01532"},{"key":"e_1_3_2_2_7_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 1--5.","author":"Bi Jing","year":"2023","unstructured":"Jing Bi, Nguyen Manh Nguyen, Ali Vosoughi, and Chenliang Xu. 2023. MISAR: A Multimodal Instructional System with Augmented Reality. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 1--5."},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Caba Fabian","unstructured":"Fabian Caba Heilbron et al. 2015. ActivityNet: A large-scale video benchmark for human activity understanding. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_9_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00127"},{"key":"e_1_3_2_2_11_1","unstructured":"Hyung Won Chung et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_2_12_1","unstructured":"Wenliang Dai et al. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. arXiv preprint arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_2_13_1","volume-title":"Computer Vision and Pattern Recognition (CVPR), 2014 IEEE Conference on. IEEE, 3639--3646","author":"Dima","unstructured":"Dima Damen et al. 2014. You-Do, I-Learn: Discovering Task Relevant Objects and their Modes of Interaction from Multi-User Egocentric Video. In Computer Vision and Pattern Recognition (CVPR), 2014 IEEE Conference on. IEEE, 3639--3646."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01531-2"},{"key":"e_1_3_2_2_15_1","volume-title":"Proceedings of the Neural Information Processing Systems (NeurIPS) Track on Datasets and Benchmarks.","author":"Ahmad","unstructured":"Ahmad Darkhalil et al. 2022. EPIC-KITCHENS VISOR Benchmark: VIdeo Segmentations and Object Relations. In Proceedings of the Neural Information Processing Systems (NeurIPS) Track on Datasets and Benchmarks."},{"key":"e_1_3_2_2_16_1","volume-title":"HoloLens 2 Sensor Streaming. arXiv preprint arXiv:2211.02648","author":"Dibene Juan C","year":"2022","unstructured":"Juan C Dibene and Enrique Dunn. 2022. HoloLens 2 Sensor Streaming. 
arXiv preprint arXiv:2211.02648 (2022)."},{"key":"e_1_3_2_2_17_1","volume-title":"Proceedings of the International Conference on Computer Vision (ICCV).","author":"Christoph","unstructured":"Christoph Feichtenhofer et al. 2019. SlowFast networks for video recognition. In Proceedings of the International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_2_18_1","unstructured":"Mingqian Feng et al. 2024. Do More Details Introduce More Hallucinations in LVLM-based Image Captioning? arXiv preprint arXiv:2406.12663 (2024)."},{"key":"e_1_3_2_2_19_1","unstructured":"Peng Gao et al. 2023. LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model. arXiv preprint arXiv:2304.15010 (2023)."},{"key":"e_1_3_2_2_20_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 6904--6913","author":"Yash","unstructured":"Yash Goyal et al. 2017. Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition. 6904--6913."},{"key":"e_1_3_2_2_21_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 18995--19012","author":"Kristen","unstructured":"Kristen Grauman et al. 2022. Ego4d: Around the world in 3,000 hours of egocentric video. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 18995--19012."},{"key":"e_1_3_2_2_22_1","volume-title":"Imagebind-llm: Multi-modality instruction tuning. arXiv preprint arXiv:2309.03905","author":"Jiaming Han","year":"2023","unstructured":"Jiaming Han et al. 2023. Imagebind-llm: Multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)."},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 15417--15429","author":"Rishi","unstructured":"Rishi Hazra et al. 2023. EgoTV: Egocentric Task Verification from Natural Language Task Descriptions. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 15417--15429."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_2_25_1","volume-title":"International Conference on Machine Learning. PMLR, 2790--2799","author":"Neil","unstructured":"Neil Houlsby et al. 2019. Parameter-efficient transfer learning for NLP. In International Conference on Machine Learning. PMLR, 2790--2799."},{"key":"e_1_3_2_2_26_1","volume-title":"Promptcap: Prompt-guided task-aware image captioning. arXiv preprint arXiv:2211.09699","author":"Yushi Hu","year":"2022","unstructured":"Yushi Hu et al. 2022. Promptcap: Prompt-guided task-aware image captioning. arXiv preprint arXiv:2211.09699 (2022)."},{"key":"e_1_3_2_2_27_1","volume-title":"FINEMATCH: Aspect-based Fine-grained Image and Text Mismatch Detection and Correction. arXiv preprint arXiv:2404.14715","author":"Hang Hua","year":"2024","unstructured":"Hang Hua et al. 2024. FINEMATCH: Aspect-based Fine-grained Image and Text Mismatch Detection and Correction. arXiv preprint arXiv:2404.14715 (2024)."},{"key":"e_1_3_2_2_28_1","volume-title":"V2xum-llm: Cross-modal video summarization with temporal prompt instruction tuning. arXiv preprint arXiv:2404.12353","author":"Hua Hang","year":"2024","unstructured":"Hang Hua, Yunlong Tang, Chenliang Xu, and Jiebo Luo. 2024. V2xum-llm: Cross-modal video summarization with temporal prompt instruction tuning. 
arXiv preprint arXiv:2404.12353 (2024)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Bin Huang et al. 2023. VTimeLLM: Empower LLM to Grasp Video Moments. arXiv:2311.18445 [cs.CV]","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"e_1_3_2_2_30_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 22910--22921","author":"Chao","unstructured":"Chao Huang et al. 2023. Egocentric Audio-Visual Object Localization. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 22910--22921."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3007841"},{"key":"e_1_3_2_2_32_1","volume-title":"Gqa: A new dataset for real-world visual reasoning and compositional question answering. In CVPR.","author":"Hudson Drew A","year":"2019","unstructured":"Drew A Hudson and Christopher D Manning. 2019. Gqa: A new dataset for real-world visual reasoning and compositional question answering. In CVPR."},{"key":"e_1_3_2_2_33_1","volume-title":"Perceiver IO: A General Architecture for Structured Inputs & Outputs. In International Conference on Learning Representations.","author":"Andrew","unstructured":"Andrew Jaegle et al. 2021. Perceiver IO: A General Architecture for Structured Inputs & Outputs. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/iccvw.2019.00540"},{"key":"e_1_3_2_2_35_1","unstructured":"Will Kay et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_2_36_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV). 3752--3761","author":"Evangelos","unstructured":"Evangelos Kazakos et al. 2019. Epic-Kitchens: A Dataset for Object Recognition in Egocentric Video. In Proceedings of the IEEE International Conference on Computer Vision (ICCV). 3752--3761."},{"key":"e_1_3_2_2_37_1","volume-title":"Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP). 787--798","author":"Sahar","unstructured":"Sahar Kazemzadeh et al. 2014. Referitgame: Referring to objects in photographs of natural scenes. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP). 787--798."},{"key":"e_1_3_2_2_38_1","volume-title":"Proceedings of the International Conference on Computer Vision (ICCV).","author":"Hilde","unstructured":"Hilde Kuehne et al. 2011. HMDB: A large video database for human motion recognition. In Proceedings of the International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_2_39_1","volume-title":"Lisa: Reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692","author":"Xin Lai","year":"2023","unstructured":"Xin Lai et al. 2023. Lisa: Reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692 (2023)."},{"key":"e_1_3_2_2_40_1","volume-title":"Mimic-it: Multi-modal in-context instruction tuning. arXiv preprint arXiv:2306.05425","author":"Bo Li","year":"2023","unstructured":"Bo Li et al. 2023. Mimic-it: Multi-modal in-context instruction tuning. arXiv preprint arXiv:2306.05425 (2023)."},{"key":"e_1_3_2_2_41_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. 
Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_2_42_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_2_43_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li et al. 2023. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298625"},{"key":"e_1_3_2_2_45_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2794--2804","author":"Qinghong Kevin","unstructured":"Kevin Qinghong Lin et al. 2023. UniVTG: Towards Unified Video-Language Temporal Grounding. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2794--2804."},{"key":"e_1_3_2_2_46_1","unstructured":"Haotian Liu et al. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_2_47_1","unstructured":"Haotian Liu et al. 2023. Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_2_48_1","unstructured":"Pinxin Liu, Luchuan Song, Daoan Zhang, Hang Hua, Yunlong Tang, et al. 2024. Emo-Avatar: Efficient Monocular Video Style Avatar through Texture Rendering. arXiv preprint arXiv:2402.00827 (2024)."},{"key":"e_1_3_2_2_49_1","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, and Lei Zhang. 2023. Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection. arXiv:2303.05499 [cs.CV]"},{"key":"e_1_3_2_2_50_1","volume-title":"Gpteval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634","author":"Liu Yang","year":"2023","unstructured":"Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu, and Chenguang Zhu. 2023. Gpteval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634 (2023)."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","unstructured":"Diogo Luvizon et al. 2020. Multi-task Deep Learning for Real-Time 3D Human Pose Estimation and Action Recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence (2020) 1--1. https:\/\/doi.org\/10.1109\/tpami.2020.2976014","DOI":"10.1109\/tpami.2020.2976014"},{"key":"e_1_3_2_2_52_1","unstructured":"Muhammad Maaz et al. 2023. Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv preprint arXiv:2306.05424 (2023)."},{"key":"e_1_3_2_2_53_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 45--57","author":"Jinjie","unstructured":"Jinjie Mai et al. 2023. EgoLoc: Revisiting 3D Object Localization from Egocentric Videos with Visual Queries. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 45--57."},{"key":"e_1_3_2_2_54_1","unstructured":"Karttikeya Mangalam et al. 2023. EgoSchema: A Diagnostic Benchmark for Very Long-form Video Language Understanding. 
arXiv preprint arXiv:2308.09126 (2023)."},{"key":"e_1_3_2_2_55_1","volume-title":"OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge. In Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Kenneth","unstructured":"Kenneth Marino et al. 2019. OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge. In Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_56_1","volume-title":"2019 international conference on document analysis and recognition (ICDAR). IEEE, 947--952","author":"Anand","unstructured":"Anand Mishra et al. 2019. Ocr-vqa: Visual question answering by reading text in images. In 2019 international conference on document analysis and recognition (ICDAR). IEEE, 947--952."},{"key":"e_1_3_2_2_57_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 3628--3638","author":"Tirumala","unstructured":"Tirumala Nagarajan et al. 2019. Grounded Human-Object Interaction Tasks with Real-World Object Videos. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 3628--3638."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.11.081"},{"key":"e_1_3_2_2_59_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 12999--13008","author":"Takehiko","unstructured":"Takehiko Ohkawa et al. 2023. AssemblyHands: Towards Egocentric Activity Understanding via 3D Hand Pose Estimation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 12999--13008."},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"crossref","unstructured":"Yulin Pan et al. 2023. Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos. arXiv preprint arXiv:2303.08345 (2023).","DOI":"10.1109\/ICCV51070.2023.01266"},{"key":"e_1_3_2_2_62_1","unstructured":"Zhiliang Peng et al. 2023. Kosmos-2: Grounding Multimodal Large Language Models to the World. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248010"},{"key":"e_1_3_2_2_64_1","unstructured":"Chiara Plizzari et al. 2023. An Outlook into the Future of Egocentric Vision. arXiv:2308.07123 [cs.CV]"},{"key":"e_1_3_2_2_65_1","unstructured":"Kevin Qinghong Lin et al. 2022. Egocentric Video-Language Pretraining. arXiv e-prints (2022) arXiv--2206."},{"key":"e_1_3_2_2_66_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 5213--5224","author":"Gorjan","unstructured":"Gorjan Radevski et al. 2023. Multimodal Distillation for Egocentric Action Recognition. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 5213--5224."},{"key":"e_1_3_2_2_67_1","unstructured":"Alec Radford et al. 2019. Language Models are Unsupervised Multitask Learners. https:\/\/api.semanticscholar.org\/CorpusID:160025533"},{"key":"e_1_3_2_2_68_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Alec","unstructured":"Alec Radford et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_69_1","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Colin Raffel","year":"2020","unstructured":"Colin Raffel et al. 2020. 
Exploring the limits of transfer learning with a unified text-to-text transformer. The Journal of Machine Learning Research 21, 1 (2020), 5485--5551.","journal-title":"The Journal of Machine Learning Research"},{"key":"e_1_3_2_2_70_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6545--6554","author":"Hanoona","unstructured":"Hanoona Rasheed et al. 2023. Fine-tuned clip models are efficient video learners. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6545--6554."},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.399"},{"key":"e_1_3_2_2_72_1","unstructured":"Sebastian Ruder. 2017. An Overview of Multi-Task Learning in Deep Neural Networks. arXiv:1706.05098 [cs.LG]"},{"key":"e_1_3_2_2_73_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 14663--14674","author":"Fiona","unstructured":"Fiona Ryan et al. 2023. Egocentric Auditory Attention Localization in Conversations. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 14663--14674."},{"key":"e_1_3_2_2_74_1","volume-title":"A-okvqa: A benchmark for visual question answering using world knowledge. In European Conference on Computer Vision","author":"Dustin Schwenk","year":"2022","unstructured":"Dustin Schwenk et al. 2022. A-okvqa: A benchmark for visual question answering using world knowledge. In European Conference on Computer Vision. Springer, 146--162."},{"key":"e_1_3_2_2_75_1","volume-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE. https:\/\/doi.org\/10","author":"Fadime","year":"2022","unstructured":"Fadime Sener et al. 2022. Assembly101: A Large-Scale Multi-View Video Dataset for Understanding Procedural Activities. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE. https:\/\/doi.org\/10.1109\/cvpr52688.2022.02042"},{"key":"e_1_3_2_2_76_1","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia. 1756--1765","author":"Xindi","unstructured":"Xindi Shang et al. 2021. Multimodal video summarization via time-aware transformers. In Proceedings of the 29th ACM International Conference on Multimedia. 1756--1765."},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_2_78_1","volume-title":"Proceedings, Part II 16","author":"Oleksii","unstructured":"Oleksii Sidorov et al. 2020. Textcaps: a dataset for image captioning with reading comprehension. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part II 16. Springer, 742--758."},{"key":"e_1_3_2_2_79_1","volume-title":"Proceedings of the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. In Proceedings of the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_80_1","volume-title":"Proceedings of the Asian Conference on Computer Vision (ACCV). 3519--3535","author":"Yunlong","unstructured":"Yunlong Tang et al. 2022. Multi-modal Segment Assemblage Network for Ad Video Editing with Importance-Coherence Reward. In Proceedings of the Asian Conference on Computer Vision (ACCV). 
3519--3535."},{"key":"e_1_3_2_2_81_1","unstructured":"Yunlong Tang et al. 2023. LLMVA-GEBC: Large Language Model with Video Adapter for Generic Event Boundary Captioning. arXiv preprint arXiv:2306.10354 (2023)."},{"key":"e_1_3_2_2_82_1","unstructured":"Yunlong Tang Jing Bi Siting Xu et al. 2023. Video Understanding with Large Language Models: A Survey. arXiv:2312.17432 [cs.CV]"},{"key":"e_1_3_2_2_83_1","volume-title":"AVicuna: Audio- Visual LLM with Interleaver and Context-Boundary Alignment for Temporal Referential Dialogue. arXiv preprint arXiv:2403.16276","author":"Tang Yunlong","year":"2024","unstructured":"Yunlong Tang, Daiki Shimada, Jing Bi, and Chenliang Xu. 2024. AVicuna: Audio- Visual LLM with Interleaver and Context-Boundary Alignment for Temporal Referential Dialogue. arXiv preprint arXiv:2403.16276 (2024)."},{"key":"e_1_3_2_2_84_1","unstructured":"Guangzhi Wang et al. 2023. What Makes for Good Visual Tokenizers for Large Language Models? arXiv preprint arXiv:2305.12223 (2023)."},{"key":"e_1_3_2_2_85_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 5250--5261","author":"Huiyu","unstructured":"Huiyu Wang et al. 2023. Ego-Only: Egocentric Action Detection without Exocentric Transferring. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 5250--5261."},{"key":"e_1_3_2_2_86_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6598--6608","author":"Jinpeng","unstructured":"Jinpeng Wang et al. 2023. All in one: Exploring unified video-language pretraining. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6598--6608."},{"key":"e_1_3_2_2_87_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 13031--13040","author":"Jian","unstructured":"Jian Wang et al. 2023. Scene-Aware Egocentric 3D Human Pose Estimation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 13031--13040."},{"key":"e_1_3_2_2_88_1","unstructured":"Teng Wang Jinrui Zhang Junjie Fei Yixiao Ge Hao Zheng Yunlong Tang et al. 2023. Caption anything: Interactive image description with diverse multimodal controls. arXiv preprint arXiv:2305.02677 (2023)."},{"key":"e_1_3_2_2_89_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 21243--21253","author":"Yilin","unstructured":"Yilin Wen et al. 2023. Hierarchical Temporal Transformer for 3D Hand Pose Estimation and Action Recognition From Egocentric RGB Videos. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 21243--21253."},{"key":"e_1_3_2_2_90_1","unstructured":"Jinheng Xie et al. 2023. VisorGPT: Learning Visual Prior via Generative Pre- Training. arXiv preprint arXiv:2305.13777 (2023)."},{"key":"e_1_3_2_2_91_1","volume-title":"Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084","author":"Hu Xu","year":"2021","unstructured":"Hu Xu et al. 2021. Videoclip: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)."},{"key":"e_1_3_2_2_92_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 5273--5284","author":"Yue","unstructured":"Yue Xu et al. 2023. EgoPCA: A New Framework for Egocentric Hand-Object Interaction Understanding. 
In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 5273--5284."},{"key":"e_1_3_2_2_93_1","volume-title":"Pink: Unveiling the Power of Referential Comprehension for Multi-modal LLMs. arXiv preprint arXiv:2310.00582","author":"Shiyu Xuan","year":"2023","unstructured":"Shiyu Xuan et al. 2023. Pink: Unveiling the Power of Referential Comprehension for Multi-modal LLMs. arXiv preprint arXiv:2310.00582 (2023)."},{"key":"e_1_3_2_2_94_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2310--2320","author":"Zihui","unstructured":"Zihui Xue et al. 2023. Egocentric Video Task Translation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2310--2320."},{"key":"e_1_3_2_2_95_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_2_96_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Hang Zhang","year":"2023","unstructured":"Hang Zhang et al. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_2_97_1","doi-asserted-by":"crossref","unstructured":"Hang Zhang et al. 2023. Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. arXiv:2306.02858 [cs.CL]","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_2_98_1","unstructured":"Shilong Zhang et al. 2023. Gpt4roi: Instruction tuning large language model on region-of-interest. arXiv preprint arXiv:2307.03601 (2023)."},{"key":"e_1_3_2_2_99_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6586--6597","author":"Yue","unstructured":"Yue Zhao et al. 2023. Learning Video Representations from Large Language Models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6586--6597."},{"key":"e_1_3_2_2_100_1","unstructured":"Lianmin Zheng et al. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. arXiv:2306.05685 [cs.CL]"},{"key":"e_1_3_2_2_101_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 1049--1058","author":"Bolei","unstructured":"Bolei Zhou et al. 2015. Temporal action localization in untrimmed videos via multi-stage CNNs. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 1049--1058."},{"key":"e_1_3_2_2_102_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 20110--20120","author":"Chenchen","unstructured":"Chenchen Zhu et al. 2023. EgoObjects: A Large-Scale Egocentric Dataset for Fine-Grained Object Understanding. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 20110--20120."},{"key":"e_1_3_2_2_103_1","unstructured":"Deyao Zhu et al. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. 
arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681618","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681618","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681618"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":102,"alternative-id":["10.1145\/3664647.3681618","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681618","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
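
Note: the block above is a single "work" record as returned by the public Crossref REST API. As a minimal sketch (assuming network access; the endpoint https://api.crossref.org/works/{DOI} is the standard Crossref API and returns exactly the {"status", "message-type", "message"} envelope seen above), the record can be re-fetched and its key fields read back out in Python:

# Minimal sketch: fetch the Crossref work record above and read out the
# fields it contains (title, author list, deposited reference count).
import json
import urllib.request

DOI = "10.1145/3664647.3681618"  # DOI of the record above
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    payload = json.load(resp)

# Envelope matches the record above: status / message-type / message.
assert payload["status"] == "ok" and payload["message-type"] == "work"
work = payload["message"]

print(work["title"][0])  # EAGLE: Egocentric AGgregated Language-video Engine
for a in work.get("author", []):  # entries carry given/family/ORCID/affiliation
    print(" ", a.get("given", ""), a.get("family", ""))
print("reference-count:", work.get("reference-count"))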