{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:28:53Z","timestamp":1778081333125,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,16]]},"DOI":"10.1145\/3699682.3728327","type":"proceedings-article","created":{"date-parts":[[2025,6,13]],"date-time":"2025-06-13T13:05:37Z","timestamp":1749819937000},"page":"224-233","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["UI-JEPA: Towards Active Perception of User Intent through Onscreen User Activity"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1349-6412","authenticated-orcid":false,"given":"Yicheng","family":"Fu","sequence":"first","affiliation":[{"name":"Stanford University, Stanford, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4244-4553","authenticated-orcid":false,"given":"Raviteja","family":"Anantha","sequence":"additional","affiliation":[{"name":"Amazon, Seattle, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6281-5202","authenticated-orcid":false,"given":"Prabal","family":"Vashisht","sequence":"additional","affiliation":[{"name":"Apple Inc., Seattle, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5302-1241","authenticated-orcid":false,"given":"Jianpeng","family":"Cheng","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7396-4658","authenticated-orcid":false,"given":"Etai","family":"Littwin","sequence":"additional","affiliation":[{"name":"Apple Inc., Cupertino, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,13]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Marah Abdin Sam\u00a0Ade Jacobs Ammar\u00a0Ahmad Awan Jyoti Aneja Ahmed Awadallah Hany Awadalla Nguyen Bach Amit Bahree Arash Bakhtiari Harkirat Behl et\u00a0al. 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.14219 (2024)."},{"key":"e_1_3_3_2_3_2","unstructured":"Anthropic. 2024. Anthropic Claude 3.5 Sonnet. https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01499"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"crossref","unstructured":"Gilles Baechler Srinivas Sunkara Maria Wang Fedir Zubach Hassan Mansoor Vincent Etter Victor C\u0103rbune Jason Lin Jindong Chen and Abhanshu Sharma. 2024. ScreenAI: A Vision-Language Model for UI and Infographics Understanding. arxiv:https:\/\/arXiv.org\/abs\/2402.04615\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2402.04615","DOI":"10.24963\/ijcai.2024\/339"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/235"},{"key":"e_1_3_3_2_7_2","unstructured":"Adrien Bardes Quentin Garrido Jean Ponce Xinlei Chen Michael Rabbat Yann LeCun Mido Assran and Nicolas Ballas. 2024. V-JEPA: Latent Video Prediction for Visual Representation Learning. (2024). https:\/\/openreview.net\/forum?id=WFYbBOEOtv"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126651"},{"key":"e_1_3_3_2_9_2","unstructured":"Tim Dettmers Artidoro Pagnoni Ari Holtzman and Luke Zettlemoyer. 2023. QLoRA: Efficient Finetuning of Quantized LLMs. arxiv:https:\/\/arXiv.org\/abs\/2305.14314\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2305.14314"},{"key":"e_1_3_3_2_10_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arxiv:https:\/\/arXiv.org\/abs\/2010.11929\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i7.16741"},{"key":"e_1_3_3_2_12_2","volume-title":"In ICLR, 2022.","author":"Hu Edward","year":"2022","unstructured":"Edward Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In In ICLR, 2022.https:\/\/iclr.cc\/virtual\/2022\/poster\/6319"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642074"},{"key":"e_1_3_3_2_14_2","unstructured":"Yann LeCun. 2022. A Path Towards Autonomous Machine Intelligence. (2022). https:\/\/openreview.net\/pdf?id=BZ5a1r-kVsf"},{"key":"e_1_3_3_2_15_2","unstructured":"Gang Li and Yang Li. 2023. Spotlight: Mobile UI Understanding using Vision-Language Models with a Focus. arxiv:https:\/\/arXiv.org\/abs\/2209.14927\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2209.14927"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445049"},{"key":"e_1_3_3_2_17_2","first-page":"74","volume-title":"Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. Association for Computational Linguistics, Barcelona, Spain, 74\u201381. https:\/\/aclanthology.org\/W04-1013"},{"key":"e_1_3_3_2_18_2","unstructured":"OpenAI. 2024. GPT-4 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2303.08774\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_3_2_19_2","unstructured":"OpenAI. 2024. GPT-4o Mini. https:\/\/openai.com\/index\/gpt-4o-mini-advancing-cost-efficient-intelligence\/"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1038\/4580"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arxiv:https:\/\/arXiv.org\/abs\/1908.10084\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1908.10084","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642777"},{"key":"e_1_3_3_2_23_2","unstructured":"Antti Tarvainen and Harri Valpola. 2018. Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results. arxiv:https:\/\/arXiv.org\/abs\/1703.01780\u00a0[cs.NE] https:\/\/arxiv.org\/abs\/1703.01780"},{"key":"e_1_3_3_2_24_2","unstructured":"Dr\u00a0Alan\u00a0D. Thompson. 2024. The Memo - Special edition: Claude 3 Opus. Substack. LifeArchitect.ai."},{"key":"e_1_3_3_2_25_2","volume-title":"36th Conference on Neural Information Processing Systems (NeurIPS 2022)","author":"Tong Zhan","year":"2023","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2023. VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training. In 36th Conference on Neural Information Processing Systems (NeurIPS 2022). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/416f9cb3276121c42eebb86352a4354a-Paper-Conference.pdf"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Bryan Wang Gang Li Xin Zhou Zhourong Chen Tovi Grossman and Yang Li. 2021. Screen2Words: Automatic Mobile UI Summarization with Multimodal Learning. arxiv:https:\/\/arXiv.org\/abs\/2108.03353\u00a0[cs.HC] https:\/\/arxiv.org\/abs\/2108.03353","DOI":"10.1145\/3472749.3474765"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606824"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Keen You Haotian Zhang Eldon Schoop Floris Weers Amanda Swearngin Jeffrey Nichols Yinfei Yang and Zhe Gan. 2024. Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs. arxiv:https:\/\/arXiv.org\/abs\/2404.05719\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2404.05719","DOI":"10.1007\/978-3-031-73039-9_14"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445186"}],"event":{"name":"UMAP '25: 33rd ACM Conference on User Modeling, Adaptation and Personalization","location":"New York City USA","acronym":"UMAP '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the 33rd ACM Conference on User Modeling, Adaptation and Personalization"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3699682.3728327","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,13]],"date-time":"2025-06-13T13:08:39Z","timestamp":1749820119000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3699682.3728327"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,13]]},"references-count":29,"alternative-id":["10.1145\/3699682.3728327","10.1145\/3699682"],"URL":"https:\/\/doi.org\/10.1145\/3699682.3728327","relation":{},"subject":[],"published":{"date-parts":[[2025,6,13]]},"assertion":[{"value":"2025-06-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}