{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T12:07:25Z","timestamp":1778760445718,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":20,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T00:00:00Z","timestamp":1776902400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"NIH \/ NIGMS via NM-INBRE","award":["P20GM103451"],"award-info":[{"award-number":["P20GM103451"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,23]]},"DOI":"10.1145\/3746467.3801517","type":"proceedings-article","created":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T11:06:32Z","timestamp":1778756792000},"page":"235-240","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["From Phase Grounding to Intelligent Surgical Narratives"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-1049-0516","authenticated-orcid":false,"given":"Ethan","family":"Peterson","sequence":"first","affiliation":[{"name":"Computer Science &amp; Engineering, New Mexico Institute of Mining and Technology, Socorro, NM, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8926-1941","authenticated-orcid":false,"given":"Huixin","family":"Zhan","sequence":"additional","affiliation":[{"name":"Computer Science &amp; Engineering, New Mexico Institute of Mining and Technology, Socorro, NM, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,5,14]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5662\/wjm.v5.i4.238"},{"key":"e_1_3_2_1_2_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv:2010.11929 [cs.CV] https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"e_1_3_2_1_3_1","volume-title":"Ren\u00e9 Vidal, Sanjeev Khudanpur, and Gregory D. Hager.","author":"Gao Yixin","year":"2014","unstructured":"Yixin Gao, S. Swaroop Vedula, Carol E. Reiley, Narges Ahmidi, Balakrishnan Varadarajan, Henry C. Lin, Lingling Tao, Luca Zappella, Benjam\u00edn B\u00e9jar, David D. Yuh, Chi Chiung Grace Chen, Ren\u00e9 Vidal, Sanjeev Khudanpur, and Gregory D. Hager. 2014. JHU-ISI Gesture and Skill Assessment Working Set (JIGSAWS): A Surgical Activity Dataset for Human Motion Modeling. In Modeling and Monitoring of Computer Assisted Interventions (M2CAI) - MICCAI Workshop. Springer, Cambridge, USA. https:\/\/cirl.lcsr.jhu.edu\/wp-content\/uploads\/2015\/11\/JIGSAWS.pdf"},{"key":"e_1_3_2_1_4_1","volume-title":"Muhammad Abdullah Jamal, and Omid Mohareri","author":"Honarmand Mohammadmahdi","year":"2024","unstructured":"Mohammadmahdi Honarmand, Muhammad Abdullah Jamal, and Omid Mohareri. 2024. VidLPRO: A Video-Language Pre-Training Framework for Robotic and Laparoscopic Surgery. arXiv:2409.04732 [cs.CV] https:\/\/arxiv.org\/abs\/2409.04732"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.anorl.2017.03.003"},{"key":"e_1_3_2_1_6_1","volume-title":"Peter C W Kim, and Jinjun Xiong","author":"Li Jiajie","year":"2024","unstructured":"Jiajie Li, Garrett Skinner, Gene Yang, Brian R Quaranto, Steven D Schwaitzberg, Peter C W Kim, and Jinjun Xiong. 2024. LLaVA-Surg: Towards Multimodal Surgical Assistant Via Structured Surgical Video Learning. arXiv:2408.07981 [cs.CV] https:\/\/arxiv.org\/abs\/2408.07981"},{"key":"e_1_3_2_1_7_1","unstructured":"Shanda Li Xiangning Chen Di He and Cho-Jui Hsieh. 2021. Can Vision Transformers Perform Convolution? arXiv:2111.01353 [cs.CV] https:\/\/arxiv.org\/abs\/2111.01353"},{"key":"e_1_3_2_1_8_1","volume-title":"Hung","author":"Li Xi","year":"2025","unstructured":"Xi Li, Nicholas Matsumoto, Ujjwal Pasupulety, Atharva Deo, Cherine Yang, Jay Moran, Miguel E. Hernandez, Peter Wager, Jasmine Lin, Jeanine Kim, Alvin C. Goh, Christian Wagner, Geoffrey A. Sonn, and Andrew J. Hung. 2025. End to End AI System for Surgical Gesture Sequence Recognition and Clinical Outcome Prediction. arXiv:2511.11899 [cs.AI] https:\/\/arxiv.org\/abs\/2511.11899"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.5220\/0007352000210029"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41746-022-00738-y"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2010.10.001"},{"key":"e_1_3_2_1_12_1","volume-title":"Omid Mohareri, and Muhammad Abdullah Jamal.","author":"Perez Alejandra","year":"2025","unstructured":"Alejandra Perez, Chinedu Nwoye, Ramtin Raji Kermani, Omid Mohareri, and Muhammad Abdullah Jamal. 2025. SurgLaVi: Large-Scale Hierarchical Dataset for Surgical Vision-Language Representation Learning. arXiv:2509.10555 [cs.CV] https:\/\/arxiv.org\/abs\/2509.10555"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, Vienna, Austria, 8748\u20138763. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2016.2593957"},{"key":"e_1_3_2_1_15_1","unstructured":"Aaron van den Oord Yazhe Li and Oriol Vinyals. 2019. Representation Learning with Contrastive Predictive Coding. arXiv:1807.03748 [cs.LG] https:\/\/arxiv.org\/abs\/1807.03748"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Soham Walimbe Britty Baby Vinkle Srivastav and Nicolas Padoy. 2025. Adaptation of Multi-Modal Representation Models for Multi-Task Surgical Computer Vision. arXiv:2507.05020 [cs.CV] https:\/\/arxiv.org\/abs\/2507.05020","DOI":"10.1007\/978-3-032-05141-7_3"},{"key":"e_1_3_2_1_17_1","unstructured":"Guankun Wang Junyi Wang Wenjin Mo Long Bai Kun Yuan Ming Hu Jinlin Wu Junjun He Yiming Huang Nicolas Padoy Zhen Lei Hongbin Liu Nassir Navab and Hongliang Ren. 2025. SurgVidLM: Towards Multi-Grained Surgical Video Understanding with Large Language Model. arXiv:2506.17873 [cs.CV] https:\/\/arxiv.org\/abs\/2506.17873"},{"key":"e_1_3_2_1_18_1","unstructured":"Gaurav Yengera Didier Mutter Jacques Marescaux and Nicolas Padoy. 2018. Less is More: Surgical Phase Recognition with Less Annotations through Self-Supervised Pre-Training of CNN-LSTM Networks. arXiv:1805.08569 [cs.CV] https:\/\/arxiv.org\/abs\/1805.08569"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Kun Yuan Vinkle Srivastav Nassir Navab and Nicolas Padoy. 2025. HecVL: Hierarchical Video-Language Pretraining for Zero-Shot Surgical Phase Recognition. arXiv:2405.10075 [cs.CV] https:\/\/arxiv.org\/abs\/2405.10075","DOI":"10.1007\/978-3-031-72089-5_29"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Bokai Zhang Jiayuan Meng Bin Cheng Dean Biskup Svetlana Petculescu and Angela Chapman. 2024. Friends Across Time: Multi-Scale Action Segmentation Transformer for Surgical Phase Recognition. arXiv:2401.11644 [cs.CV] https:\/\/arxiv.org\/abs\/2401.11644","DOI":"10.1109\/EMBC53108.2024.10782887"}],"event":{"name":"ACMSE 2026: 2026 ACM Southeast Conference","location":"Troy University Troy AL USA","acronym":"ACMSE 2026"},"container-title":["Proceedings of the 2026 ACM Southeast Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746467.3801517","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T11:09:33Z","timestamp":1778756973000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746467.3801517"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,23]]},"references-count":20,"alternative-id":["10.1145\/3746467.3801517","10.1145\/3746467"],"URL":"https:\/\/doi.org\/10.1145\/3746467.3801517","relation":{},"subject":[],"published":{"date-parts":[[2026,4,23]]},"assertion":[{"value":"2026-05-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}