{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:15:14Z","timestamp":1777889714229,"version":"3.51.4"},"reference-count":62,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01311","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"14128-14140","source":"Crossref","is-referenced-by-count":0,"title":["ProbRes: Probabilistic Jump Diffusion for Open-World Egocentric Activity Recognition"],"prefix":"10.1109","author":[{"given":"Sanjoy","family":"Kundu","sequence":"first","affiliation":[{"name":"Auburn University Auburn,CSSE Department,Alabama,USA,36849"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shanmukha","family":"Vellamcheti","sequence":"additional","affiliation":[{"name":"Auburn University Auburn,CSSE Department,Alabama,USA,36849"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sathyanarayanan N.","family":"Aakur","sequence":"additional","affiliation":[{"name":"Auburn University Auburn,CSSE Department,Alabama,USA,36849"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.5220\/0010288009350942"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2022.03.007"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02209"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_38"},{"key":"ref5","author":"Brown","year":"2020","journal-title":"Language Models are Few-Shot Learners"},{"key":"ref6","author":"Bubeck","year":"2023","journal-title":"Sparks of Artificial General Intelligence: Early experiments with GPT-4"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1440"},{"key":"ref8","author":"Chen","journal-title":"A Simple Framework for Contrastive Learning of Visual Representations"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3521658"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1090\/mbk\/121\/79"},{"key":"ref11","author":"Guo","year":"2025","journal-title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning"},{"key":"ref12","author":"Devlin","journal-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"},{"key":"ref13","author":"Dong","year":"2022","journal-title":"Open World DETR: Transformer based Open World Object Detection"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00536"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33718-5_23"},{"key":"ref17","article-title":"The Llama 3 Herd of Models","author":"Grattafiori","year":"2024","journal-title":"arXiv"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01834"},{"key":"ref20","author":"Gu","journal-title":"Open-vocabulary Object Detection via Vision and Language Knowledge Distillation"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392452"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0242"},{"key":"ref23","author":"Jia","year":"2021","journal-title":"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision"},{"key":"ref24","first-page":"18661","article-title":"Supervised Contrastive Learning","author":"Khosla","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00609"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73202-7_3"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01765"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00812"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.399"},{"key":"ref30","author":"Li","journal-title":"Supervision Exists Everywhere: A Data Efficient Contrastive Language-Image Pre-training Paradigm"},{"key":"ref31","article-title":"Egocentric Video-Language Pretraining","author":"Qinghong Lin","year":"2022","journal-title":"arXiv"},{"key":"ref32","author":"Liu","year":"2019","journal-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.350"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.209"},{"key":"ref35","article-title":"Compositional entailment learning for hyperbolic vision-language models","author":"Pal","year":"2024","journal-title":"arXiv preprint"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28247"},{"key":"ref37","author":"Pichai","year":"2024","journal-title":"Introducing gemini 2.0: Our new ai model for the agentic era"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.14476"},{"key":"ref40","author":"Radford","journal-title":"Improving Language Understanding by Generative Pre-Training"},{"key":"ref41","author":"Radford","journal-title":"Language Models are Unsupervised Multitask Learners"},{"key":"ref42","author":"Radford","year":"2021","journal-title":"Learning Transferable Visual Models From Natural Language Supervision"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298691"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00772"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01019"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.649"},{"key":"ref51","author":"Touvron","year":"2023","journal-title":"LLaMA: Open and Efficient Foundation Language Models"},{"key":"ref52","article-title":"Llama 2: Open Foundation and FineTuned Chat Models","author":"Touvron","year":"2023","journal-title":"arXiv"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00806"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3115\/981732.981751"},{"key":"ref55","author":"Xi","journal-title":"UMB: Understanding Model Behavior for Open-World Object Detection"},{"key":"ref56","author":"Xu","journal-title":"Do Egocentric Video-Language Models Truly Understand Hand-Object Interactions?"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2336"},{"key":"ref58","author":"Ye","journal-title":"MMEgo: Towards Building Egocentric Multimodal LLMs"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00221"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2017.21"},{"key":"ref61","article-title":"Learning Video Representations from Large Language Models","author":"Zhao","year":"2022","journal-title":"arXiv"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.210"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445159.pdf?arnumber=11445159","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:18:46Z","timestamp":1777612726000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445159\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":62,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01311","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}