{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T22:40:12Z","timestamp":1759012812743,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","funder":[{"name":"Hong Kong University Grants Committee","award":["17209822"],"award-info":[{"award-number":["17209822"]}]},{"name":"Hong Kong Innovation and Technology Commission","award":["ITS\/383\/23FP"],"award-info":[{"award-number":["ITS\/383\/23FP"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,28]]},"DOI":"10.1145\/3746059.3747626","type":"proceedings-article","created":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T07:49:12Z","timestamp":1758959352000},"page":"1-18","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["NoteIt: A System Converting Instructional Videos to Interactable Notes Through Multimodal Video Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2496-3429","authenticated-orcid":false,"given":"Running","family":"Zhao","sequence":"first","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4857-7143","authenticated-orcid":false,"given":"Zhihan","family":"Jiang","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3650-7332","authenticated-orcid":false,"given":"Xinchen","family":"Zhang","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0628-2851","authenticated-orcid":false,"given":"Chirui","family":"Chang","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4223-3502","authenticated-orcid":false,"given":"Handi","family":"Chen","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8861-4001","authenticated-orcid":false,"given":"Weipeng","family":"Deng","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7940-1459","authenticated-orcid":false,"given":"Luyao","family":"Jin","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4285-1626","authenticated-orcid":false,"given":"Xiaojuan","family":"Qi","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1976-7992","authenticated-orcid":false,"given":"Xun","family":"Qian","sequence":"additional","affiliation":[{"name":"Google, Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3454-8731","authenticated-orcid":false,"given":"Edith C.H.","family":"Ngai","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,9,27]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Aaron Bangor Philip\u00a0T Kortum and James\u00a0T Miller. 2008. An empirical evaluation of the system usability scale. Intl. Journal of Human\u2013Computer Interaction 24 6 (2008) 574\u2013594.","DOI":"10.1080\/10447310802205776"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/1753326.1753417"},{"key":"e_1_3_3_2_4_2","unstructured":"ByteDance. 2024. Seed-TTS: A Family of High-Quality Versatile Speech Generation Models. 
arxiv:https:\/\/arXiv.org\/abs\/2406.02430\u00a0[eess.AS] https:\/\/arxiv.org\/abs\/2406.02430"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3490099.3511132"},{"key":"e_1_3_3_2_6_2","unstructured":"Brandon Castellano. [n. d.]. PySceneDetect: Video Cut Detection and Analysis Tool. https:\/\/github.com\/Breakthrough\/PySceneDetect."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174025"},{"key":"e_1_3_3_2_8_2","unstructured":"Jun Chen Deyao Zhu Kilichbek Haydarov Xiang Li and Mohamed Elhoseiny. 2023. Video ChatCaptioner: Towards enriched spatiotemporal descriptions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.04227 (2023)."},{"key":"e_1_3_3_2_9_2","unstructured":"Lin Chen Xilin Wei Jinsong Li Xiaoyi Dong Pan Zhang Yuhang Zang Zehui Chen Haodong Duan Zhenyu Tang Li Yuan et\u00a0al. 2024. Sharegpt4video: Improving video understanding and generation with better captions. Advances in Neural Information Processing Systems 37 (2024) 19472\u201319495."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642443"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/VR.2019.8798338"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474778"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/2380116.2380130"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/2501988.2502052"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-51828-8_23"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Franz Faul Edgar Erdfelder Axel Buchner and Albert-Georg Lang. 2009. Statistical power analyses using G* Power 3.1: Tests for correlation and regression analyses. 
Behavior research methods 41 4 (2009) 1149\u20131160.","DOI":"10.3758\/BRM.41.4.1149"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376437"},{"key":"e_1_3_3_2_18_2","unstructured":"Google. 2024. NotebookLM. https:\/\/notebooklm.google\/. Accessed: 2025-04-07."},{"key":"e_1_3_3_2_19_2","unstructured":"Mingfei Han Linjie Yang Xiaojun Chang and Heng Wang. 2023. Shot2story20k: A new benchmark for comprehensive understanding of multi-shot videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.10300 (2023)."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Wafa\u2019A Hazaymeh and Moath\u00a0Khalaf Alomery. 2022. The Effectiveness of Visual Mind Mapping Strategy for Improving English Language Learners\u2019 Critical Thinking Skills and Reading Ability. European Journal of Educational Research 11 1 (2022) 141\u2013150.","DOI":"10.12973\/eu-jer.11.1.141"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Markus\u00a0H Hefter. 2024. Note-taking fosters distance video learning: smartphones as risk and intellectual values as protective factors. Scientific Reports 14 1 (2024) 16962.","DOI":"10.1038\/s41598-024-67898-7"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Aidan Hogan Eva Blomqvist Michael Cochez Claudia d\u2019Amato Gerard\u00a0De Melo Claudio Gutierrez Sabrina Kirrane Jos\u00e9 Emilio\u00a0Labra Gayo Roberto Navigli Sebastian Neumaier et\u00a0al. 2021. Knowledge graphs. ACM Computing Surveys (Csur) 54 4 (2021) 1\u201337.","DOI":"10.1145\/3447772"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"e_1_3_3_2_24_2","unstructured":"Qirui Huang Min Lu Joel Lanir Dani Lischinski Daniel Cohen-Or and Hui Huang. 2024. Graphimind: Llm-centric interface for information graphics design. 
arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.13245 (2024)."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Zhihan Jiang Handi Chen Rui Zhou Jing Deng Xinchen Zhang Running Zhao Cong Xie Yifang Wang and Edith\u00a0CH Ngai. 2023. Healthprism: a visual analytics system for exploring children\u2019s physical and mental health profiles with multimodal data. IEEE Transactions on Visualization and Computer Graphics 30 1 (2023) 1205\u20131215.","DOI":"10.1109\/TVCG.2023.3326943"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Zhihan Jiang Xin He Chenhui Lu Binbin Zhou Xiaoliang Fan Cheng Wang Xiaojuan Ma Edith\u00a0CH Ngai and Longbiao Chen. 2022. Understanding drivers\u2019 visual and comprehension loads in traffic violation hotspots leveraging crowd-based driving simulation. IEEE transactions on intelligent transportation systems 23 12 (2022) 23369\u201323383.","DOI":"10.1109\/TITS.2022.3204068"},{"key":"e_1_3_3_2_27_2","unstructured":"Zhihan Jiang Running Zhao Lin Lin Yue Yu Handi Chen Xinchen Zhang Xuhai Xu Yifang Wang Xiaojuan Ma and Edith\u00a0CH Ngai. 2025. DietGlance: Dietary Monitoring and Personalized Analysis at a Glance with Knowledge-Empowered AI Assistant. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.01317 (2025)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/1054972.1055046"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Kenneth\u00a0A Kiewra Nelson\u00a0F DuBois David Christian Anne McShane Michelle Meyerhoffer and David Roskelley. 1991. Note-taking functions and techniques. Journal of educational psychology 83 2 (1991) 240.","DOI":"10.1037\/0022-0663.83.2.240"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/2556288.2556986"},{"key":"e_1_3_3_2_31_2","volume-title":"Workshop on AI Robotics at the Int. Conf. 
on Intelligent Robots and Systems (IROS)","author":"Knepper Ross\u00a0A","year":"2014","unstructured":"Ross\u00a0A Knepper, Dishaan Ahuja, Geoffrey Lalonde, and Daniela Rus. 2014. Distributed assembly with and\/or graphs. In Workshop on AI Robotics at the Int. Conf. on Intelligent Robots and Systems (IROS)."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1093\/acprof:oso\/9780199744336.001.0001"},{"key":"e_1_3_3_2_33_2","first-page":"19730","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730\u201319742."},{"key":"e_1_3_3_2_34_2","first-page":"19730","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730\u201319742."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01480"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1145\/2909132.2909277"},{"key":"e_1_3_3_2_37_2","first-page":"15","volume-title":"Graphics Interface","author":"Nawhal Megha","year":"2019","unstructured":"Megha Nawhal, Jacqueline\u00a0B Lang, Greg Mori, and Parmit\u00a0K Chilana. 2019. VideoWhiz: Non-Linear Interactive Overviews for Recipe Videos.. In Graphics Interface. 15\u20131."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/2858036.2858137"},{"key":"e_1_3_3_2_39_2","unstructured":"NoteGPT. 2024. NoteGPT. https:\/\/notegpt.io\/. Accessed: 2025-04-07."},{"key":"e_1_3_3_2_40_2","unstructured":"OpenAI. 2024. GPT-4o System Card. 
arxiv:https:\/\/arXiv.org\/abs\/2410.21276\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2410.21276"},{"key":"e_1_3_3_2_41_2","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy\u00a0V. Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel HAZIZA Francisco Massa Alaaeldin El-Nouby Mido Assran Nicolas Ballas Wojciech Galuba Russell Howes Po-Yao Huang Shang-Wen Li Ishan Misra Michael Rabbat Vasu Sharma Gabriel Synnaeve Hu Xu Herve Jegou Julien Mairal Patrick Labatut Armand Joulin and Piotr Bojanowski. 2024. DINOv2: Learning Robust Visual Features without Supervision. Transactions on Machine Learning Research (2024). https:\/\/openreview.net\/forum?id=a68SUt6zFt Featured Certification."},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413765"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/2642918.2647400"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3708359.3712093"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580921"},{"key":"e_1_3_3_2_46_2","unstructured":"Leping Qiu Erin\u00a0Seongyoon Kim Sangho Suh Ludwig Sidenmark and Tovi Grossman. 2025. MaRginalia: Enabling In-person Lecture Capturing and Note-taking Through Mixed Reality. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.16010 (2025)."},{"key":"e_1_3_3_2_47_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. 
PMLR, 8748\u20138763."},{"key":"e_1_3_3_2_48_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_2_49_2","first-page":"28492","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492\u201328518."},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Ren\u00e9 Ranftl Katrin Lasinger David Hafner Konrad Schindler and Vladlen Koltun. 2020. Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer. IEEE transactions on pattern analysis and machine intelligence 44 3 (2020) 1623\u20131637.","DOI":"10.1109\/TPAMI.2020.3019967"},{"key":"e_1_3_3_2_51_2","unstructured":"Nikhila Ravi Valentin Gabeur Yuan-Ting Hu Ronghang Hu Chaitanya Ryali Tengyu Ma Haitham Khedr Roman R\u00e4dle Chloe Rolland Laura Gustafson et\u00a0al. 2024. Sam 2: Segment anything in images and videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.00714 (2024)."},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-50020-7_7"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"crossref","unstructured":"Bernard Rosner Robert\u00a0J Glynn and Mei-Ling\u00a0T Lee. 2006. The Wilcoxon signed rank test for paired comparisons of clustered data. 
Biometrics 62 1 (2006) 185\u2013192.","DOI":"10.1111\/j.1541-0420.2005.00389.x"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126544"},{"key":"e_1_3_3_2_55_2","unstructured":"Ahmad Seifi and Amir Moshayeri. 2024. The Influence of Color Schemes and Aesthetics on User Satisfaction in Web Design: An Empirical Study. International Journal of Advanced Human Computer Interaction 2 2 (2024) 33\u201343."},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517735"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01020"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","unstructured":"Hijung\u00a0Valentina Shin Floraine Berthouzoz Wilmot Li and Fr\u00e9do Durand. 2015. Visual transcripts: lecture notes from blackboard-style lecture videos. ACM Trans. Graph. 34 6 Article 240 (Nov. 2015) 10\u00a0pages. 10.1145\/2816795.2818123","DOI":"10.1145\/2816795.2818123"},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376155"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445185"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445721"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"crossref","unstructured":"Alexandre\u00a0N Tuch Sandra\u00a0P Roth Kasper Hornb\u00e6k Klaus Opwis and Javier\u00a0A Bargas-Avila. 2012. Is beautiful really usable? Toward understanding the relation between usability aesthetics and affect in HCI. Computers in human behavior 28 5 (2012) 1596\u20131607.","DOI":"10.1016\/j.chb.2012.03.024"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376759"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445162"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/2556288.2557407"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"crossref","unstructured":"Liang Wang Gaofeng Che Jiantuan Hu and Lin Chen. 
2024. Online review helpfulness and information overload: the roles of text image and video elements. Journal of Theoretical and Applied Electronic Commerce Research 19 2 (2024) 1243\u20131266.","DOI":"10.3390\/jtaer19020064"},{"key":"e_1_3_3_2_67_2","unstructured":"Mingqiu Wang Izhak Shafran Hagen Soltau Wei Han Yuan Cao Dian Yu and Laurent\u00a0El Shafey. 2024. Retrieval Augmented End-to-End Spoken Dialog Models. arxiv:https:\/\/arXiv.org\/abs\/2402.01828\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2402.01828"},{"key":"e_1_3_3_2_68_2","unstructured":"Xiaohan Wang Yuhui Zhang Orr Zohar and Serena Yeung-Levy. 2024. VideoAgent: Long-form Video Understanding with Large Language Model as Agent. arxiv:https:\/\/arXiv.org\/abs\/2403.10517\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2403.10517"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"crossref","unstructured":"Zhou Wang Alan\u00a0C Bovik Hamid\u00a0R Sheikh and Eero\u00a0P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13 4 (2004) 600\u2013612.","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_2_70_2","unstructured":"Zhenhailong Wang Manling Li Ruochen Xu Luowei Zhou Jie Lei Xudong Lin Shuohang Wang Ziyi Yang Chenguang Zhu Derek Hoiem et\u00a0al. 2022. Language models with image descriptors are strong few-shot video-language learners. Advances in Neural Information Processing Systems 35 (2022) 8483\u20138497."},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"publisher","DOI":"10.1145\/2675133.2675219"},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"publisher","unstructured":"Chengpei Xu Wenjing Jia Ruomei Wang Xiangjian He Baoquan Zhao and Yuanfang Zhang. 2023. Semantic Navigation of PowerPoint-Based Lecture Video for AutoNote Generation. IEEE Transactions on Learning Technologies 16 1 (2023) 1\u201317. 
10.1109\/TLT.2022.3216535","DOI":"10.1109\/TLT.2022.3216535"},{"key":"e_1_3_3_2_73_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00159"},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_3_2_75_2","unstructured":"Dongjie Yang Suyuan Huang Chengqiang Lu Xiaodong Han Haoxin Zhang Yan Gao Yao Hu and Hai Zhao. 2024. Vript: A video is worth thousands of words. Advances in Neural Information Processing Systems 37 (2024) 57240\u201357261."},{"key":"e_1_3_3_2_76_2","unstructured":"Jihan Yang Shusheng Yang Anjali\u00a0W Gupta Rilyn Han Li Fei-Fei and Saining Xie. 2024. Thinking in space: How multimodal large language models see remember and recall spaces. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.14171 (2024)."},{"key":"e_1_3_3_2_77_2","unstructured":"Saelyne Yang Sangkyung Kwak Tae\u00a0Soo Kim and Juho Kim. 2022. Improving Video Interfaces by Presenting Informational Units of Videos. CHI\u201922 Extended Abstracts. Association for Computing Machinery (2022)."},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581126"},{"key":"e_1_3_3_2_79_2","doi-asserted-by":"publisher","DOI":"10.1145\/3708359.3712144"},{"key":"e_1_3_3_2_80_2","volume-title":"The Eleventh International Conference on Learning Representations","author":"Zeng Andy","year":"2023","unstructured":"Andy Zeng, Maria Attarian, brian ichter, Krzysztof\u00a0Marcin Choromanski, Adrian Wong, Stefan Welker, Federico Tombari, Aveek Purohit, Michael\u00a0S Ryoo, Vikas Sindhwani, Johnny Lee, Vincent Vanhoucke, and Pete Florence. 2023. Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=G2Q2Mh3avow"},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"crossref","unstructured":"Hang Zhang Xin Li and Lidong Bing. 2023. 
Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.02858 (2023).","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_3_2_82_2","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.10592 (2023)."}],"event":{"name":"UIST '25: The 38th Annual ACM Symposium on User Interface Software and Technology","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"],"location":"Busan Republic of Korea","acronym":"UIST '25"},"container-title":["Proceedings of the 38th Annual ACM Symposium on User Interface Software and Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746059.3747626","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T22:12:53Z","timestamp":1759011173000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746059.3747626"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,27]]},"references-count":81,"alternative-id":["10.1145\/3746059.3747626","10.1145\/3746059"],"URL":"https:\/\/doi.org\/10.1145\/3746059.3747626","relation":{},"subject":[],"published":{"date-parts":[[2025,9,27]]},"assertion":[{"value":"2025-09-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}