{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T00:38:44Z","timestamp":1776127124526,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":125,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,28]]},"DOI":"10.1145\/3746059.3747612","type":"proceedings-article","created":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T07:49:12Z","timestamp":1758959352000},"page":"1-24","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["Vid2Coach: Transforming How-To Videos into Task Assistants"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0031-9180","authenticated-orcid":false,"given":"Mina","family":"Huh","sequence":"first","affiliation":[{"name":"The University of Texas at Austin, Austin, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7394-5169","authenticated-orcid":false,"given":"Zihui","family":"Xue","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin, Austin, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7250-5699","authenticated-orcid":false,"given":"Ujjaini","family":"Das","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin, Austin, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5710-4342","authenticated-orcid":false,"given":"Kumar","family":"Ashutosh","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin, Austin, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9591-5873","authenticated-orcid":false,"given":"Kristen","family":"Grauman","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin, Austin, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3908-4366","authenticated-orcid":false,"given":"Amy","family":"Pavel","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, California, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,9,27]]},"reference":[{"key":"e_1_3_3_3_2_2","unstructured":"Last visited: 2025. https:\/\/www.bemyeyes.com\/"},{"key":"e_1_3_3_3_3_2","unstructured":"Last visited: 2025. https:\/\/aira.io\/"},{"key":"e_1_3_3_3_4_2","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia\u00a0Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et\u00a0al. 2023. Gpt-4 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.08774 (2023)."},{"key":"e_1_3_3_3_5_2","unstructured":"Google Cloud\u00a0Vertex AI. [n. d.]. Multimodal Live API. https:\/\/cloud.google.com\/vertex-ai\/generative-ai\/docs\/multimodal-live-api"},{"key":"e_1_3_3_3_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3663548.3675659"},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"crossref","unstructured":"Riku Arakawa Jill\u00a0Fain Lehman and Mayank Goel. 2024. Prism-q&a: Step-aware voice assistant on a smartwatch enabled by multimodal procedure tracking and large language models. Proceedings of the ACM on Interactive Mobile Wearable and Ubiquitous Technologies 8 4 (2024) 1\u201326.","DOI":"10.1145\/3699759"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676350"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"crossref","unstructured":"Rob Argent Ailish Daly and Brian Caulfield. 2018. Patient involvement with home-based exercise programs: can connected health interventions influence adherence? JMIR mHealth and uHealth 6 3 (2018) e8518.","DOI":"10.2196\/mhealth.8518"},{"key":"e_1_3_3_3_10_2","unstructured":"Kumar Ashutosh Tushar Nagarajan Georgios Pavlakos Kris Kitani and Kristen Grauman. 2024. ExpertAF: Expert actionable feedback from video. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.00672 (2024)."},{"key":"e_1_3_3_3_11_2","unstructured":"Kumar Ashutosh Santhosh\u00a0Kumar Ramakrishnan Triantafyllos Afouras and Kristen Grauman. 2023. Video-mined task graphs for keystep recognition in instructional videos. Advances in Neural Information Processing Systems 36 (2023) 67833\u201367846."},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01779"},{"key":"e_1_3_3_3_13_2","unstructured":"Zechen Bai Pichao Wang Tianjun Xiao Tong He Zongbo Han Zheng Zhang and Mike\u00a0Zheng Shou. 2024. Hallucination of multimodal large language models: A survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.18930 (2024)."},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/2380116.2380129"},{"key":"e_1_3_3_3_15_2","unstructured":"BBC. 2018. Mary Berry\u2019s tasty eggs Benedict Florentine - Classic Mary Berry - BBC. https:\/\/www.youtube.com\/watch?v=YybJTrdwWQk"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"crossref","unstructured":"Hugh Beyer and Karen Holtzblatt. 1999. Contextual design. interactions 6 1 (1999) 32\u201342.","DOI":"10.1145\/291224.291229"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/1866029.1866080"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"crossref","unstructured":"Marie\u00a0Claire Bilyk Jessica\u00a0M Sontrop Gwen\u00a0E Chapman Susan\u00a0I Barr and Linda Mamer. 2009. Food experiences and eating patterns of visually impaired and blind people. Canadian Journal of Dietetic practice and research 70 1 (2009) 13\u201318.","DOI":"10.3148\/70.1.2009.13"},{"key":"e_1_3_3_3_19_2","unstructured":"Chi-Min Chan Chunpu Xu Ruibin Yuan Hongyin Luo Wei Xue Yike Guo and Jie Fu. 2024. Rq-rag: Learning to refine queries for retrieval augmented generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.00610 (2024)."},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445131"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676375"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/1240866.1241003"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/2501988.2502052"},{"key":"e_1_3_3_3_24_2","unstructured":"Ken Click. 2024. Crisp Tortilla Pizza!https:\/\/www.youtube.com\/watch?v=2U45pP3i85g"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/2470654.2481340"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"crossref","unstructured":"Elyse\u00a0M Connors Polly\u00a0M Abbott Daniel\u00a0E Norris Jennifer\u00a0J Ottowitz and Brigitte\u00a0N Morren. 2023. The Perspectives of Vision Rehabilitation Therapists on the State of the Profession: A Time for Action? Journal of Visual Impairment & Blindness 117 4 (2023) 303\u2013313.","DOI":"10.1177\/0145482X231194634"},{"key":"e_1_3_3_3_27_2","unstructured":"Cook!\u00a0Stacey Cook. 2024. Beef And Broccoli Stir Fry | Beef Stir Fry With Vegetables. https:\/\/www.youtube.com\/watch?v=BBABeZjlRM8"},{"key":"e_1_3_3_3_28_2","unstructured":"Crouton Crackerjacks. 2014. How to Make Tiramisu!! Classic Italian Dessert Recipe. https:\/\/www.youtube.com\/watch?v=bvVH4Mk2ku4"},{"key":"e_1_3_3_3_29_2","unstructured":"Crouton Crackerjacks. 2017. How to Make Strawberry Jam!! Homemade Small Batch Preserves Recipe. https:\/\/www.youtube.com\/watch?v=F5LhDkAfxA8"},{"key":"e_1_3_3_3_30_2","unstructured":"Google Deepmind. [n. d.]. Project Astra. https:\/\/deepmind.google\/models\/project-astra\/"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01817"},{"key":"e_1_3_3_3_32_2","unstructured":"Fallow. 2023. POV: How to Make an Omelette Like a Chef. https:\/\/www.youtube.com\/watch?v=fqqwFWqxUr4"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300527"},{"key":"e_1_3_3_3_34_2","unstructured":"Chaoyou Fu Yuhan Dai Yongdong Luo Lei Li Shuhuai Ren Renrui Zhang Zihan Wang Chenyu Zhou Yunhang Shen Mengdan Zhang et\u00a0al. 2024. Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.21075 (2024)."},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"crossref","unstructured":"Charles Goodwin and John Heritage. 1990. Conversation analysis. Annual review of anthropology 19 (1990) 283\u2013307.","DOI":"10.1146\/annurev.an.19.100190.001435"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/1866029.1866054"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1016\/S0166-4115(08)62386-9"},{"key":"e_1_3_3_3_38_2","unstructured":"HoneySuckle. [n. d.]. World\u2019s Best CHOCOLATE CHIP COOKIES Recipe: Crunchy Outside Soft & Chewy Inside. https:\/\/www.youtube.com\/watch?v=f-M3JN_7LGU"},{"key":"e_1_3_3_3_39_2","unstructured":"Honeysuckle. 2020. World\u2019s Best CHOCOLATE CHIP COOKIES Recipe: Crunchy Outside Soft & Chewy Inside. https:\/\/www.youtube.com\/watch?v=f-M3JN_7LGU"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"crossref","unstructured":"Baixiang Huang Canyu Chen and Kai Shu. 2024. Can large language models identify authorship? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.08213 (2024).","DOI":"10.18653\/v1\/2024.findings-emnlp.26"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173869"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502081"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676369"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606735"},{"key":"e_1_3_3_3_45_2","unstructured":"Mina Huh Fangyuan Xu Yi-Hao Peng Chongyan Chen Hansika Murugu Danna Gurari Eunsol Choi and Amy Pavel. 2024. Long-Form Answers to Visual Questions from Blind and Low Vision People. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06303 (2024)."},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581494"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3563657.3596059"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342380"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/SMC.2017.8122808"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642183"},{"key":"e_1_3_3_3_51_2","first-page":"1","volume-title":"Proceedings of the CHI Conference on Human Factors in Computing Systems","author":"Jiang Lucy","year":"2024","unstructured":"Lucy Jiang, Crescentia Jung, Mahika Phutane, Abigale Stangl, and Shiri Azenkot. 2024. \u201cIt\u2019s Kind of Context Dependent\u201d: Understanding Blind and Low Vision People\u2019s Video Accessibility Preferences Across Viewing Scenarios. In Proceedings of the CHI Conference on Human Factors in Computing Systems. 1\u201320."},{"key":"e_1_3_3_3_52_2","unstructured":"Liqiang Jing Ruosen Li Yunmo Chen and Xinya Du. 2023. FaithScore: Fine-grained Evaluations of Hallucinations in Large Vision-Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.01477 (2023)."},{"key":"e_1_3_3_3_53_2","unstructured":"Johnny. 2020. How to Make the Meat filling for Dumplings (Mandu). https:\/\/www.youtube.com\/watch?v=61bEy8CX53c"},{"key":"e_1_3_3_3_54_2","unstructured":"The\u00a0Wallstreet Journal. [n. d.]. Meta\u2019s AI-Powered Ray-Bans Are Life-Enhancing for the Blind. https:\/\/www.wsj.com\/tech\/ai\/metas-ai-powered-ray-bans-are-life-enhancing-for-the-blind-3ae38026"},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"publisher","DOI":"10.1145\/634067.634227"},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580772"},{"key":"e_1_3_3_3_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/2556288.2556986"},{"key":"e_1_3_3_3_58_2","unstructured":"Natashas Kitchen. [n. d.]. Dessert: Easy Mini Pavlovas - Homemade Meringues Recipe. https:\/\/www.youtube.com\/watch?v=Zo5ATW4eq8o"},{"key":"e_1_3_3_3_59_2","unstructured":"Natasha\u2019s Kitchen. 2016. Dessert: Easy Mini Pavlovas - Homemade Meringues Recipe. https:\/\/www.youtube.com\/watch?v=Zo5ATW4eq8o"},{"key":"e_1_3_3_3_60_2","unstructured":"Preppy Kitchen. 2021. Mashed Potatoes Recipe. https:\/\/www.youtube.com\/watch?v=HfdFlenF6XI"},{"key":"e_1_3_3_3_61_2","unstructured":"KQED. 2020. Bread Flapjacks | Jacques P\u00e9pin Cooking At Home | KQED. https:\/\/www.youtube.com\/watch?v=86CeN5AFMG0"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3132525.3134796"},{"key":"e_1_3_3_3_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676449"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642230"},{"key":"e_1_3_3_3_65_2","unstructured":"Yibin Lei Yu Cao Tianyi Zhou Tao Shen and Andrew Yates. 2024. Corpus-steered query expansion with large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.18031 (2024)."},{"key":"e_1_3_3_3_66_2","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel et\u00a0al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in neural information processing systems 33 (2020) 9459\u20139474."},{"key":"e_1_3_3_3_67_2","unstructured":"Bo Li Yuanhan Zhang Dong Guo Renrui Zhang Feng Li Hao Zhang Kaichen Zhang Peiyuan Zhang Yanwei Li Ziwei Liu et\u00a0al. 2024. Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.03326 (2024)."},{"key":"e_1_3_3_3_68_2","unstructured":"Chaoyu Li Sid Padmanabhuni Maryam Cheema Hasti Seifi and Pooyan Fazli. 2025. VideoA11y: Method and Dataset for Accessible Video Description. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.20480 (2025)."},{"key":"e_1_3_3_3_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/3441852.3471215"},{"key":"e_1_3_3_3_70_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642233"},{"key":"e_1_3_3_3_71_2","unstructured":"Franklin\u00a0Mingzhe Li Kaitlyn Ng Bin Zhu and Patrick Carrington. 2025. OSCAR: Object Status and Contextual Awareness for Recipes to Support Non-Visual Cooking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.05962 (2025)."},{"key":"e_1_3_3_3_72_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517490"},{"key":"e_1_3_3_3_73_2","doi-asserted-by":"publisher","DOI":"10.1145\/3663548.3675662"},{"key":"e_1_3_3_3_74_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173961"},{"key":"e_1_3_3_3_75_2","unstructured":"Li Liu Diji Yang Sijia Zhong Kalyana Suma\u00a0Sree Tholeti Lei Ding Yi Zhang and Leilani\u00a0H Gilpin. 2024. Right this way: Can VLMs Guide Us to See More to Answer Questions? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.00394 (2024)."},{"key":"e_1_3_3_3_76_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445233"},{"key":"e_1_3_3_3_77_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545703"},{"key":"e_1_3_3_3_78_2","unstructured":"Microsoft. [n. d.]. Microsoft AI Audio Descriptions. https:\/\/github.com\/microsoft\/ai-audio-descriptions"},{"key":"e_1_3_3_3_79_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_3_3_80_2","doi-asserted-by":"crossref","unstructured":"Sewon Min Kalpesh Krishna Xinxi Lyu Mike Lewis Wen-tau Yih Pang\u00a0Wei Koh Mohit Iyyer Luke Zettlemoyer and Hannaneh Hajishirzi. 2023. Factscore: Fine-grained atomic evaluation of factual precision in long form text generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.14251 (2023).","DOI":"10.18653\/v1\/2023.emnlp-main.741"},{"key":"e_1_3_3_3_81_2","doi-asserted-by":"publisher","DOI":"10.1145\/3242587.3242633"},{"key":"e_1_3_3_3_82_2","doi-asserted-by":"publisher","DOI":"10.1145\/3500868.3559470"},{"key":"e_1_3_3_3_83_2","doi-asserted-by":"publisher","DOI":"10.1145\/3663548.3675617"},{"key":"e_1_3_3_3_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702209"},{"key":"e_1_3_3_3_85_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642632"},{"key":"e_1_3_3_3_86_2","unstructured":"OpenAI. [n. d.]. OpenAI Whisper. https:\/\/openai.com\/index\/whisper\/"},{"key":"e_1_3_3_3_87_2","volume-title":"Educational Data Mining 2014","author":"Ostrow Korinn","year":"2014","unstructured":"Korinn Ostrow and Neil Heffernan. 2014. Testing the multimedia principle in the real world: a comparison of video vs. Text feedback in authentic middle school math assignments. In Educational Data Mining 2014."},{"key":"e_1_3_3_3_88_2","unstructured":"Viorica Patraucean Lucas Smaira Ankush Gupta Adria Recasens Larisa Markeeva Dylan Banarse Skanda Koppula Mateusz Malinowski Yi Yang Carl Doersch et\u00a0al. 2023. Perception test: A diagnostic benchmark for multimodal video models. Advances in Neural Information Processing Systems 36 (2023) 42748\u201342761."},{"key":"e_1_3_3_3_89_2","doi-asserted-by":"publisher","DOI":"10.1145\/2642918.2647400"},{"key":"e_1_3_3_3_90_2","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415864"},{"key":"e_1_3_3_3_91_2","unstructured":"Rohith Peddi Shivvrat Arya Bharath Challa Likhitha Pallapothula Akshay Vyas Bhavya Gouripeddi Qifan Zhang Jikai Wang Vasundhara Komaragiri Eric Ragan et\u00a0al. 2024. CaptainCook4D: A dataset for understanding errors in procedural activities. Advances in Neural Information Processing Systems 37 (2024) 135626\u2013135679."},{"key":"e_1_3_3_3_92_2","doi-asserted-by":"publisher","DOI":"10.1145\/3441852.3471234"},{"key":"e_1_3_3_3_93_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445572"},{"key":"e_1_3_3_3_94_2","doi-asserted-by":"publisher","DOI":"10.1145\/3573051.3593376"},{"key":"e_1_3_3_3_95_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_3_96_2","unstructured":"Gordon Ramsay. [n. d.]. How To Cook Eggs Benedict | Gordon Ramsay. https:\/\/www.youtube.com\/watch?v=gBJjRYk0yC0"},{"key":"e_1_3_3_3_97_2","unstructured":"Gordon Ramsey. 2018. How To Cook Eggs Benedict | Gordon Ramsay. https:\/\/www.youtube.com\/watch?v=f-M3JN_7LGU"},{"key":"e_1_3_3_3_98_2","doi-asserted-by":"publisher","DOI":"10.1145\/2513383.2513392"},{"key":"e_1_3_3_3_99_2","doi-asserted-by":"publisher","DOI":"10.1145\/3677386.3682103"},{"key":"e_1_3_3_3_100_2","doi-asserted-by":"publisher","DOI":"10.1145\/3210825.3213565"},{"key":"e_1_3_3_3_101_2","unstructured":"Yale Song Eugene Byrne Tushar Nagarajan Huiyu Wang Miguel Martin and Lorenzo Torresani. 2024. Ego4d goal-step: Toward hierarchical understanding of procedural activities. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_3_102_2","unstructured":"Sweetology. [n. d.]. Gingerbread House Assembly. https:\/\/www.youtube.com\/watch?v=ZW6plAZ-dkY"},{"key":"e_1_3_3_3_103_2","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew\u00a0M Dai Anja Hauth Katie Millican et\u00a0al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.11805 (2023)."},{"key":"e_1_3_3_3_104_2","unstructured":"Just\u00a0Jordan Things. [n. d.]. How To Build a Flower Arrangement In Only 10 Minutes. https:\/\/www.youtube.com\/watch?v=9PVFYYLjN-w"},{"key":"e_1_3_3_3_105_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300514"},{"key":"e_1_3_3_3_106_2","unstructured":"TikTok. [n. d.]. TikTok. https:\/\/tiktok.com"},{"key":"e_1_3_3_3_107_2","doi-asserted-by":"crossref","unstructured":"Tram Thi\u00a0Minh Tran Shane Brown Oliver Weidlich Soojeong Yoo and Callum Parker. 2025. Wearable AR in Everyday Contexts: Insights from a Digital Ethnography of YouTube Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.06191 (2025).","DOI":"10.1145\/3706598.3713572"},{"key":"e_1_3_3_3_108_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445721"},{"key":"e_1_3_3_3_109_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642839"},{"key":"e_1_3_3_3_110_2","doi-asserted-by":"crossref","unstructured":"Marynel V\u00e1zquez and Aaron Steinfeld. 2014. An assisted photography framework to help visually impaired users properly aim a camera. ACM Transactions on Computer-Human Interaction (TOCHI) 21 5 (2014) 1\u201329.","DOI":"10.1145\/2651380"},{"key":"e_1_3_3_3_111_2","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et\u00a0al. 2024. Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.12191 (2024)."},{"key":"e_1_3_3_3_112_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445347"},{"key":"e_1_3_3_3_113_2","doi-asserted-by":"crossref","unstructured":"Maximiliane Windl and Sven Mayer. 2022. The skewed privacy concerns of bystanders in smart environments. Proceedings of the ACM on Human-Computer Interaction 6 MHCI (2022) 1\u201321.","DOI":"10.1145\/3546719"},{"key":"e_1_3_3_3_114_2","unstructured":"Guangxuan Xiao Yuandong Tian Beidi Chen Song Han and Mike Lewis. 2023. Efficient streaming language models with attention sinks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.17453 (2023)."},{"key":"e_1_3_3_3_115_2","unstructured":"Shuchang Xu Xiaofu Jin Huamin Qu and Yukang Yan. 2025. DanmuA11y: Making Time-Synced On-Screen Video Comments (Danmu) Accessible to Blind and Low Vision Users via Multi-Viewer Audio Discussions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.15711 (2025)."},{"key":"e_1_3_3_3_116_2","unstructured":"Zihui Xue Joungbin An Xitong Yang and Kristen Grauman. 2024. Progress-Aware Video Frame Captioning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.02071 (2024)."},{"key":"e_1_3_3_3_117_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01750"},{"key":"e_1_3_3_3_118_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581126"},{"key":"e_1_3_3_3_119_2","doi-asserted-by":"publisher","DOI":"10.1145\/3708359.3712144"},{"key":"e_1_3_3_3_120_2","doi-asserted-by":"publisher","DOI":"10.1145\/3490099.3511106"},{"key":"e_1_3_3_3_121_2","unstructured":"YouDescribe. [n. d.]. YouDescribe - Audio Description for YouTube Videos. https:\/\/youdescribe.org\/"},{"key":"e_1_3_3_3_122_2","unstructured":"YouTube. [n. d.]. YouTube. https:\/\/www.youtube.com\/"},{"key":"e_1_3_3_3_123_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580655"},{"key":"e_1_3_3_3_124_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502036"},{"key":"e_1_3_3_3_125_2","doi-asserted-by":"crossref","unstructured":"Yaoyao Zhong Junbin Xiao Wei Ji Yicong Li Weihong Deng and Tat-Seng Chua. 2022. Video question answering: Datasets algorithms and challenges. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.01225 (2022).","DOI":"10.18653\/v1\/2022.emnlp-main.432"},{"key":"e_1_3_3_3_126_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01033"}],"event":{"name":"UIST '25: The 38th Annual ACM Symposium on User Interface Software and Technology","location":"Busan Republic of Korea","acronym":"UIST '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the 38th Annual ACM Symposium on User Interface Software and Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746059.3747612","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T22:14:24Z","timestamp":1759011264000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746059.3747612"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,27]]},"references-count":125,"alternative-id":["10.1145\/3746059.3747612","10.1145\/3746059"],"URL":"https:\/\/doi.org\/10.1145\/3746059.3747612","relation":{},"subject":[],"published":{"date-parts":[[2025,9,27]]},"assertion":[{"value":"2025-09-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}