{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:54:14Z","timestamp":1781538854626,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810764","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"681-689","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Egocentric Action Recognition with Retrieval-Augmented Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8450-0388","authenticated-orcid":false,"given":"Yishan","family":"Zou","sequence":"first","affiliation":[{"name":"Ulster University, Belfast, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0882-7902","authenticated-orcid":false,"given":"Chris","family":"Nugent","sequence":"additional","affiliation":[{"name":"Ulster University, Belfast, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8931-2454","authenticated-orcid":false,"given":"Matthew","family":"Burns","sequence":"additional","affiliation":[{"name":"Ulster University, Belfast, United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2008-1736","authenticated-orcid":false,"given":"Shengli","family":"Wu","sequence":"additional","affiliation":[{"name":"Ulster University, Belfast, 
United Kingdom"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1492-0970","authenticated-orcid":false,"given":"Mingzhu","family":"Xu","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1582-5764","authenticated-orcid":false,"given":"Meng","family":"Liu","sequence":"additional","affiliation":[{"name":"Shandong Jianzhu University, Jinan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","first-page":"2206","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Borgeaud Sebastian","year":"2022","unstructured":"Sebastian Borgeaud, Arthur Mensch, Jordan Hoffmann, Trevor Cai, Eliza Rutherford, Katie Millican, George\u00a0Bm Van Den\u00a0Driessche, Jean-Baptiste Lespiau, Bogdan Damoc, Aidan Clark, et\u00a0al. 2022. Improving language models by retrieving from trillions of tokens. In Proceedings of the International Conference on Machine Learning. 2206\u20132240."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01719"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Wenhu Chen Hexiang Hu Xi Chen Pat Verga and William\u00a0W Cohen. 2022. Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.02928.","DOI":"10.18653\/v1\/2022.emnlp-main.375"},{"key":"e_1_3_3_1_5_2","unstructured":"Wenhu Chen Hexiang Hu Chitwan Saharia and William\u00a0W Cohen. 2022. Re-imagen: Retrieval-augmented text-to-image generator. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.14491."},{"key":"e_1_3_3_1_6_2","first-page":"720","volume-title":"Proceedings of the European Conference on Computer Vision","author":"Damen Dima","year":"2018","unstructured":"Dima Damen, Hazel Doughty, Giovanni\u00a0Maria Farinella, Sanja Fidler, Antonino Furnari, Evangelos Kazakos, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et\u00a0al. 2018. Scaling egocentric vision: The epic-kitchens dataset. In Proceedings of the European Conference on Computer Vision. 720\u2013736."},{"key":"e_1_3_3_1_7_2","first-page":"4171","volume-title":"Proceedings of North American Chapter of the Association for Computational Linguistics-HLT","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of North American Chapter of the Association for Computational Linguistics-HLT. 4171\u20134186."},{"key":"e_1_3_3_1_8_2","first-page":"457","volume-title":"Proceedings of the European Conference on Computer Vision","author":"Gowda Shreyank\u00a0N","year":"2024","unstructured":"Shreyank\u00a0N Gowda, Anurag Arnab, and Jonathan Huang. 2024. Optimizing factorized encoder models: Time and memory reduction for scalable and efficient action recognition. In Proceedings of the European Conference on Computer Vision. 457\u2013474."},{"key":"e_1_3_3_1_9_2","unstructured":"Alex Graves Greg Wayne and Ivo Danihelka. 2014. Neural turing machines. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1410.5401."},{"key":"e_1_3_3_1_10_2","first-page":"3929","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Mingwei Chang. 2020. Retrieval augmented language model pre-training. In Proceedings of the International Conference on Machine Learning. 
3929\u20133938."},{"key":"e_1_3_3_1_11_2","unstructured":"Ahmet Iscen Mathilde Caron Alireza Fathi and Cordelia Schmid. 2023. Retrieval-enhanced contrastive vision-text models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.07196."},{"key":"e_1_3_3_1_12_2","first-page":"617","volume-title":"Proceedings of the Conference on Robot Learning","author":"Kumar Ashish","year":"2020","unstructured":"Ashish Kumar, Saurabh Gupta, and Jitendra Malik. 2020. Learning navigation subroutines from egocentric videos. In Proceedings of the Conference on Robot Learning. 617\u2013626."},{"key":"e_1_3_3_1_13_2","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et\u00a0al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020) 9459\u20139474."},{"key":"e_1_3_3_1_14_2","unstructured":"Bo Li Yuanhan Zhang Dong Guo Renrui Zhang Feng Li Hao Zhang Kaichen Zhang Peiyuan Zhang Yanwei Li Ziwei Liu et\u00a0al. 2024. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.03326."},{"key":"e_1_3_3_1_15_2","unstructured":"Feng Li Renrui Zhang Hao Zhang Yuanhan Zhang Bo Li Wei Li Zejun Ma and Chunyuan Li. 2024. Llava-next-interleave: Tackling multi-image video and 3d in large multimodal models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.07895."},{"key":"e_1_3_3_1_16_2","unstructured":"Kunchang Li Yali Wang Yinan He Yizhuo Li Yi Wang Limin Wang and Yu Qiao. 2022. Uniformerv2: Spatiotemporal learning by arming image vits with video uniformer. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.09552."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298625"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210003"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240549"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00264"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00683"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"crossref","unstructured":"Xiusheng Lu Yanbin Hao Lechao Cheng Sicheng Zhao Yutao Liu and Mingli Song. 2025. Mixed attention and channel shift transformer for efficient action recognition. ACM Transactions on Multimedia Computing Communications and Applications 21 (2025) 1\u201320.","DOI":"10.1145\/3712594"},{"key":"e_1_3_3_1_25_2","unstructured":"Huaihai Lyu Chaofan Chen Yuheng Ji and Changsheng Xu. 2025. Egoprompt: Prompt learning for egocentric action recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.03266."},{"key":"e_1_3_3_1_26_2","first-page":"1","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Menon Aditya\u00a0Krishna","year":"2021","unstructured":"Aditya\u00a0Krishna Menon, Sadeep Jayasumana, Ankit\u00a0Singh Rawat, Himanshu Jain, Andreas Veit, and Sanjiv Kumar. 2021. Long-tail learning via logit adjustment. In Proceedings of the International Conference on Learning Representations. 
1\u201324."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01275"},{"key":"e_1_3_3_1_28_2","first-page":"1","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Pei Baoqi","year":"2025","unstructured":"Baoqi Pei, Yifei Huang, Jilan Xu, Guo Chen, Yuping He, Lijin Yang, Yali Wang, Weidi Xie, Yu Qiao, Fei Wu, and Limin Wang. 2025. Modeling fine-grained hand-object dynamics for egocentric video representation learning. In Proceedings of the International Conference on Learning Representations. 1\u201322."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"e_1_3_3_1_30_2","first-page":"8748","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning. 8748\u20138763."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00278"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3549555.3549585"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00431"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00641"},{"key":"e_1_3_3_1_36_2","unstructured":"Gunnar\u00a0A Sigurdsson Abhinav Gupta Cordelia Schmid Ali Farhadi and Karteek Alahari. 2018. Charades-ego: A large-scale dataset of paired third and first person videos. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.09626."},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"crossref","unstructured":"Xuemeng Song Haoqiang Lin Haokun Wen Bohan Hou Mingzhu Xu and Liqiang Nie. 2025. A comprehensive survey on composed image retrieval. ACM Transactions on Information Systems 44 1 (2025) 54.","DOI":"10.1145\/3767328"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Shuhan Tan Tushar Nagarajan and Kristen Grauman. 2023. Egodistill: Egocentric head motion distillation for efficient video understanding. Advances in Neural Information Processing Systems 36 (2023) 33485\u201333498.","DOI":"10.52202\/075280-1455"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00091"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Thanh-Dat Truong and Khoa Luu. 2025. Cross-view action recognition understanding from exocentric to egocentric perspective. Neurocomputing 614 (2025) 128731.","DOI":"10.1016\/j.neucom.2024.128731"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00484"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00306"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01854"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01322"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00897"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01284"},{"key":"e_1_3_3_1_47_2","first-page":"1","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Xu Jilan","year":"2025","unstructured":"Jilan Xu, Yifei Huang, Baoqi Pei, Junlin Hou, Qingqiu Li, Guo Chen, Yuejie Zhang, Rui Feng, and Weidi Xie. 2025. EgoExo-Gen: Ego-centric video prediction by watching exo-centric videos. 
In Proceedings of the International Conference on Learning Representations. 1\u201318."},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"crossref","unstructured":"Mingzhu Xu Zhengyu Sun Yijun Hu Haoyu Tang Yupeng Hu Xuemeng Song and Liqiang Nie. 2025. Superpixel segmentation with edge guided local-global attention network. IEEE Transactions on Circuits and Systems for Video Technology 35 12 (2025) 11922\u201311934.","DOI":"10.1109\/TCSVT.2025.3587485"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00486"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"crossref","unstructured":"Zihui\u00a0Sherry Xue and Kristen Grauman. 2023. Learning fine-grained view-invariant representations from unpaired ego-exo videos via temporal alignment. Advances in Neural Information Processing Systems 36 (2023) 53688\u201353710.","DOI":"10.52202\/075280-2336"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"crossref","unstructured":"Xun Yang Shanshan Wang Jian Dong Jianfeng Dong Meng Wang and Tat-Seng Chua. 2022. Video moment retrieval with cross-modal neural architecture search. IEEE Transactions on Image Processing 31 (2022) 1204\u20131216.","DOI":"10.1109\/TIP.2022.3140611"},{"key":"e_1_3_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"e_1_3_3_1_54_2","unstructured":"Michihiro Yasunaga Armen Aghajanyan Weijia Shi Rich James Jure Leskovec Percy Liang Mike Lewis Luke Zettlemoyer and Wen-tau Yih. 2022. Retrieval-augmented multimodal language modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.12561."},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889703"},{"key":"e_1_3_3_1_56_2","unstructured":"Yue Zhao and Philipp Kr\u00e4henb\u00fchl. 2023. Training a large video model on a single machine in a day. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.16669."},{"key":"e_1_3_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"e_1_3_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:00:28Z","timestamp":1781535628000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810764"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":57,"alternative-id":["10.1145\/3805622.3810764","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810764","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}