{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,26]],"date-time":"2026-06-26T23:57:03Z","timestamp":1782518223225,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,10,12]],"date-time":"2026-10-12T00:00:00Z","timestamp":1791763200000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2112635"],"award-info":[{"award-number":["2112635"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3716553.3750795","type":"proceedings-article","created":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T13:13:16Z","timestamp":1760188396000},"page":"516-525","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["A Multimodal Classroom Video Question-Answering Framework for Automated Understanding of Collaborative Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8469-0355","authenticated-orcid":false,"given":"Nithin","family":"Sivakumaran","sequence":"first","affiliation":[{"name":"UNC Chapel Hill, Chapel Hill, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4233-6847","authenticated-orcid":false,"given":"Chia-Yu","family":"Yang","sequence":"additional","affiliation":[{"name":"UNC Chapel Hill, Chapel Hill, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9702-9154","authenticated-orcid":false,"given":"Abhay","family":"Zala","sequence":"additional","affiliation":[{"name":"UNC Chapel Hill, Chapel Hill, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1670-0054","authenticated-orcid":false,"given":"Shoubin","family":"Yu","sequence":"additional","affiliation":[{"name":"UNC Chapel Hill, Chapel Hill, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0684-8358","authenticated-orcid":false,"given":"Daeun","family":"Hong","sequence":"additional","affiliation":[{"name":"Indiana University Bloomington, Bloomington, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8681-1358","authenticated-orcid":false,"given":"Xiaotian","family":"Zou","sequence":"additional","affiliation":[{"name":"Indiana University Bloomington, Bloomington, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6689-505X","authenticated-orcid":false,"given":"Elias","family":"Stengel-Eskin","sequence":"additional","affiliation":[{"name":"UNC Chapel Hill, Chapel Hill, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3253-8369","authenticated-orcid":false,"given":"Dan","family":"Carpenter","sequence":"additional","affiliation":[{"name":"North Carolina State University, Raleigh, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8900-0514","authenticated-orcid":false,"given":"Wookhee","family":"Min","sequence":"additional","affiliation":[{"name":"North Carolina State University, Raleigh, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2275-5212","authenticated-orcid":false,"given":"Cindy","family":"Hmelo-Silver","sequence":"additional","affiliation":[{"name":"Indiana University, Bloomington, Indiana, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2038-9239","authenticated-orcid":false,"given":"Jonathan","family":"Rowe","sequence":"additional","affiliation":[{"name":"North Carolina State University, Raleigh, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1481-6601","authenticated-orcid":false,"given":"James","family":"Lester","sequence":"additional","affiliation":[{"name":"North Carolina State University, Raleigh, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2965-5354","authenticated-orcid":false,"given":"Mohit","family":"Bansal","sequence":"additional","affiliation":[{"name":"UNC Chapel Hill, Chapel Hill, North Carolina, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_3_2_2_2","first-page":"224","volume-title":"Proceedings of the 17th International Conference on Educational Data Mining","author":"Acosta Halim","year":"2024","unstructured":"Halim Acosta, Seung Lee, Bradford Mott, Haesol Bae, Krista Glazewski, Cindy Hmelo-Silver, and James Lester. 2024. Multimodal Learning Analytics for Predicting Student Collaboration Satisfaction in Collaborative Game-Based Learning. In Proceedings of the 17th International Conference on Educational Data Mining. 224\u2013235."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Jessica Andrews-Todd Yang Jiang Jonathan Steinberg Samuel\u00a0L Pugh and Sidney\u00a0K D\u2019Mello. 2023. Investigating collaborative problem solving skills and outcomes across computer-based tasks. Computers & Education 207 (2023) 104928.","DOI":"10.1016\/j.compedu.2023.104928"},{"key":"e_1_3_3_2_4_2","volume-title":"INTERSPEECH","author":"Bain Max","year":"2023","unstructured":"Max Bain, Jaesung Huh, Tengda Han, and Andrew Zisserman. 2023. WhisperX: Time-Accurate Speech Transcription of Long-Form Audio. In INTERSPEECH."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","unstructured":"Hao-Shu Fang Jiefeng Li Hongyang Tang Chao Xu Haoyi Zhu Yuliang Xiu Yong-Lu Li and Cewu Lu. 2023. AlphaPose: Whole-Body Regional Multi-Person Pose Estimation and Tracking in Real-Time. IEEE Transactions on Pattern Analysis and Machine Intelligence 45 6 (2023) 7157\u20137173. 10.1109\/TPAMI.2022.3222784","DOI":"10.1109\/TPAMI.2022.3222784"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Stephen\u00a0M Fiore Arthur Graesser and Samuel Greiff. 2018. Collaborative problem-solving education for the twenty-first-century workforce. Nature human behaviour 2 6 (2018) 367\u2013369.","DOI":"10.1038\/s41562-018-0363-y"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Arthur\u00a0C Graesser Stephen\u00a0M Fiore Samuel Greiff Jessica Andrews-Todd Peter\u00a0W Foltz and Friedrich\u00a0W Hesse. 2018. Advancing the science of collaborative problem solving. Psychological science in the public interest 19 2 (2018) 59\u201392.","DOI":"10.1177\/1529100618808244"},{"key":"e_1_3_3_2_8_2","unstructured":"Daya Guo Qihao Zhu Dejian Yang Zhenda Xie Kai Dong Wentao Zhang Guanting Chen Xiao Bi Y. Wu Y.\u00a0K. Li Fuli Luo Yingfei Xiong and Wenfeng Liang. 2024. DeepSeek-Coder: When the Large Language Model Meets Programming \u2013 The Rise of Code Intelligence. arxiv:https:\/\/arXiv.org\/abs\/2401.14196\u00a0[cs.SE]"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Cindy\u00a0E Hmelo-Silver. 2004. Problem-based learning: What and how do students learn? Educational psychology review 16 (2004) 235\u2013266.","DOI":"10.1023\/B:EDPR.0000034022.16470.f3"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.5244\/C.34.29"},{"key":"e_1_3_3_2_13_2","unstructured":"Albert\u00a0Q. Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra\u00a0Singh Chaplot Diego de\u00a0las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u00e9lio\u00a0Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven\u00a0Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William\u00a0El Sayed. 2023. Mistral 7B. arxiv:https:\/\/arXiv.org\/abs\/2310.06825\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"e_1_3_3_2_14_2","unstructured":"Glenn Jocher Ayush Chaurasia and Jing Qiu. 2023. Ultralytics YOLO. https:\/\/github.com\/ultralytics\/ultralytics"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.261"},{"key":"e_1_3_3_2_16_2","volume-title":"Content Analysis: An Introduction to Its Methodology (4th ed.)","author":"Krippendorff Klaus","year":"2018","unstructured":"Klaus Krippendorff. 2018. Content Analysis: An Introduction to Its Methodology (4th ed.). Sage Publications."},{"key":"e_1_3_3_2_17_2","volume-title":"NeurIPS","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Tamara\u00a0L. Berg, and Mohit Bansal. 2021. QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries. In NeurIPS."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"e_1_3_3_2_19_2","volume-title":"ICML","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In ICML."},{"key":"e_1_3_3_2_20_2","unstructured":"Kunchang Li Yinan He Yi Wang Yizhuo Li Wenhai Wang Ping Luo Yali Wang Limin Wang and Yu Qiao. 2023. VideoChat: Chat-Centric Video Understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.06355 (2023)."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.4018\/978-1-4666-9441-5.ch013"},{"key":"e_1_3_3_2_23_2","volume-title":"ECCV","author":"Liu Shilong","year":"2024","unstructured":"Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, et\u00a0al. 2024. Grounding DINO: Marrying dino with grounded pre-training for open-set object detection. In ECCV."},{"key":"e_1_3_3_2_24_2","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV)","author":"Liu Ye","year":"2024","unstructured":"Ye Liu, Jixuan He, Wanhua Li, Junsik Kim, Donglai Wei, Hanspeter Pfister, and Chang\u00a0Wen Chen. 2024. R2-Tuning: Efficient Image-to-Video Transfer Learning for Video Temporal Grounding. In Proceedings of the European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3506860.3506865"},{"key":"e_1_3_3_2_26_2","unstructured":"Ziqiao Ma Jiayi Pan and Joyce Chai. 2023. World-to-words: Grounded open vocabulary acquisition through fast mapping in vision-language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.08685 (2023)."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_3_2_28_2","unstructured":"WonJun Moon Sangeek Hyun SuBeen Lee and Jae-Pil Heo. 2024. Correlation-guided Query-Dependency Calibration in Video Representation Learning for Temporal Grounding. arxiv:https:\/\/arXiv.org\/abs\/2311.08835\u00a0[cs.CV]"},{"key":"e_1_3_3_2_29_2","unstructured":"OpenAI. 2024. GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Jun Oshima and H\u00a0Ulrich Hoppe. 2021. Finding meaning in log-file data. International handbook of computer-supported collaborative learning (2021) 569\u2013584.","DOI":"10.1007\/978-3-030-65291-3_31"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/SISY52375.2021.9582508"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"e_1_3_3_2_33_2","volume-title":"ICLS","author":"Starr Emma\u00a0L","year":"2018","unstructured":"Emma\u00a0L Starr, Joseph\u00a0M Reilly, and Bertrand Schneider. 2018. Toward Using Multi-Modal Learning Analytics to Support and Measure Collaboration in Co-Located Dyads.. In ICLS."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"\u00d6mer S\u00fcmer Patricia Goldberg Sidney D\u2019Mello Peter Gerjets Ulrich Trautwein and Enkelejda Kasneci. 2021. Multimodal engagement analysis from facial videos in the classroom. IEEE Transactions on Affective Computing 14 2 (2021) 1012\u20131027.","DOI":"10.1109\/TAFFC.2021.3127692"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01998"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"crossref","unstructured":"Alina\u00a0A von Davier Jiangang Hao Lei Liu and Patrick Kyllonen. 2017. Interdisciplinary research agenda in support of assessment of collaborative problem solving: Lessons learned from developing a collaborative science assessment prototype. Computers in Human Behavior 76 (2017) 631\u2013640.","DOI":"10.1016\/j.chb.2017.04.059"},{"key":"e_1_3_3_2_38_2","volume-title":"ECCV","author":"Wang Xijun","year":"2024","unstructured":"Xijun Wang, Junbang Liang, Chun-Kai Wang, Kenan Deng, Yu Lou, Ming Lin, and Shan Yang. 2024. ViLA: Efficient Video-Language Alignment for Video Question Answering. In ECCV."},{"key":"e_1_3_3_2_39_2","unstructured":"Ziyang Wang Shoubin Yu Elias Stengel-Eskin Jaehong Yoon Feng Cheng Gedas Bertasius and Mohit Bansal. 2024. VideoTree: Adaptive Tree-based Video Representation for LLM Reasoning on Long Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.19209 (2024)."},{"key":"e_1_3_3_2_40_2","volume-title":"NeurIPS","author":"Yu Shoubin","year":"2023","unstructured":"Shoubin Yu, Jaemin Cho, Prateek Yadav, and Mohit Bansal. 2023. Self-Chained Image-Language Model for Video Localization and Question Answering. In NeurIPS."},{"key":"e_1_3_3_2_41_2","unstructured":"Shoubin Yu Jaehong Yoon and Mohit Bansal. 2024. CREMA: Generalizable and Efficient Video-Language Reasoning via Multimodal Modular Fusion. arxiv:https:\/\/arXiv.org\/abs\/2402.05889\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2402.05889"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02208"},{"key":"e_1_3_3_2_43_2","volume-title":"International Conference on Learning Representations","author":"Zhang Tianyi","year":"2020","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian\u00a0Q. Weinberger, and Yoav Artzi. 2020. BERTScore: Evaluating Text Generation with BERT. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=SkeHuCVFDr"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01349"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_22"}],"event":{"name":"ICMI '25: International Conference on Multimodal Interaction","location":"Canberra Australia","acronym":"ICMI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 27th International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3716553.3750795","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3716553.3750795","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T22:26:20Z","timestamp":1769466380000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3716553.3750795"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":44,"alternative-id":["10.1145\/3716553.3750795","10.1145\/3716553"],"URL":"https:\/\/doi.org\/10.1145\/3716553.3750795","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}