{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T13:33:53Z","timestamp":1769261633691,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":40,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557639","type":"print"},{"value":"9789819557646","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5764-6_38","type":"book-chapter","created":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T06:07:53Z","timestamp":1769148473000},"page":"560-573","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Understanding Surgical Triplet Videos Through Transferable Visual Models from\u00a0Natural Language Supervision"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1704-3050","authenticated-orcid":false,"given":"Yunhao","family":"Li","sequence":"first","affiliation":[]},{"given":"Aoying","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yu-Xi","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Qiong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xiucai","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Patrizia","family":"Savi","sequence":"additional","affiliation":[]},{"given":"Ying","family":"Hu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6483-8326","authenticated-orcid":false,"given":"Yan","family":"Pang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,24]]},"reference":[{"issue":"1","key":"38_CR1","doi-asserted-by":"publisher","first-page":"407","DOI":"10.1515\/cdbme-2018-0097","volume":"4","author":"T Abdulbaki Alshirbaji","year":"2018","unstructured":"Abdulbaki Alshirbaji, T., Jalal, N.A., M\u00f6ller, K.: Surgical tool classification in laparoscopic videos using convolutional neural network. Curr. Direct. Biomed. Eng. 4(1), 407\u2013410 (2018)","journal-title":"Curr. Direct. Biomed. Eng."},{"key":"38_CR2","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"38_CR3","unstructured":"Bai, S., et\u00a0al.: Qwen2. 5-VL technical report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"38_CR4","unstructured":"Ban, Y., et al.: Concept graph neural networks for surgical video understanding. IEEE Trans. Med. Imaging (2023)"},{"key":"38_CR5","unstructured":"Bodenstedt, S., et al.: Real-time image-based instrument classification for laparoscopic surgery. arXiv preprint arXiv:1808.00178 (2018)"},{"key":"38_CR6","doi-asserted-by":"publisher","unstructured":"Chen, Y., He, S., Jin, Y., Qin, J.: Surgical activity triplet recognition via triplet disentanglement. In: Greenspan, H., et al. (eds.) MICCAI 2023. LNCS, vol. 14228, pp. 451\u2013461. Springer (2023). https:\/\/doi.org\/10.1007\/978-3-031-43996-4_43","DOI":"10.1007\/978-3-031-43996-4_43"},{"key":"38_CR7","doi-asserted-by":"publisher","unstructured":"Cheng, Y., Liu, L., Wang, S., Jin, Y., Sch\u00f6nlieb, C.B., Aviles-Rivero, A.I.: Why deep surgical models fail?: revisiting surgical action triplet recognition through the lens of robustness. In: Chen, H., Luo, L. (eds) TML4H 2023. LNCS, vol. 1393, pp. 177\u2013189. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-39539-0_15","DOI":"10.1007\/978-3-031-39539-0_15"},{"key":"38_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"343","DOI":"10.1007\/978-3-030-59716-0_33","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2020","author":"T Czempiel","year":"2020","unstructured":"Czempiel, T., et al.: TeCNO: surgical phase recognition with multi-stage temporal convolutional networks. In: Martel, A.L., et al. (eds.) MICCAI 2020. LNCS, vol. 12263, pp. 343\u2013352. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-59716-0_33"},{"key":"38_CR9","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (long and Short Papers), pp. 4171\u20134186 (2019)","DOI":"10.18653\/v1\/N19-1423"},{"issue":"11","key":"38_CR10","doi-asserted-by":"publisher","first-page":"3309","DOI":"10.1109\/TMI.2022.3182995","volume":"41","author":"X Ding","year":"2022","unstructured":"Ding, X., Li, X.: Exploring segment-level semantics for online phase recognition from surgical videos. IEEE Trans. Med. Imaging 41(11), 3309\u20133319 (2022)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"38_CR11","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"issue":"4","key":"38_CR12","doi-asserted-by":"publisher","first-page":"684","DOI":"10.1097\/SLA.0000000000004425","volume":"273","author":"CR Garrow","year":"2021","unstructured":"Garrow, C.R., et al.: Machine learning for surgical phase recognition: a systematic review. Ann. Surg. 273(4), 684\u2013693 (2021)","journal-title":"Ann. Surg."},{"key":"38_CR13","doi-asserted-by":"publisher","unstructured":"Gui, S., Wang, Z.: Tail-enhanced representation learning for surgical triplet recognition. In: Linguraru, M.G., et al. (eds.) MICCAI 2024. LNCS, vol. 15011, pp. 689\u2013699. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-72120-5_64","DOI":"10.1007\/978-3-031-72120-5_64"},{"key":"38_CR14","doi-asserted-by":"crossref","unstructured":"Gui, S., Wang, Z., Chen, J., Zhou, X., Zhang, C., Cao, Y.: MT4MTL-KD: a multi-teacher knowledge distillation framework for triplet recognition. IEEE Trans. Med. Imaging (2023)","DOI":"10.1109\/TMI.2023.3345736"},{"key":"38_CR15","unstructured":"Honarmand, M., Jamal, M.A., Mohareri, O.: VidLPRO: a video-language pre-training framework for robotic and laparoscopic surgery. arXiv preprint arXiv:2409.04732 (2024)"},{"key":"38_CR16","unstructured":"Hurst, A., et\u00a0al.: GPT-4o system card. arXiv preprint arXiv:2410.21276 (2024)"},{"key":"38_CR17","doi-asserted-by":"crossref","unstructured":"Jha, D., et al.: Exploring deep learning methods for real-time surgical instrument segmentation in laparoscopy. In: 2021 IEEE EMBS International Conference on Biomedical and Health Informatics (BHI), pp.\u00a01\u20134. IEEE (2021)","DOI":"10.1109\/BHI50953.2021.9508610"},{"key":"38_CR18","doi-asserted-by":"crossref","unstructured":"Kati\u0107, D., et al.: LapOntoSPM: an ontology for laparoscopic surgeries and its application to surgical phase recognition. Int. J. Comput. Assist. Radiol. Surg. 10, 1427\u20131434 (2015)","DOI":"10.1007\/s11548-015-1222-1"},{"key":"38_CR19","doi-asserted-by":"crossref","unstructured":"Kletz, S., Schoeffmann, K., Benois-Pineau, J., Husslein, H.: Identifying surgical instruments in laparoscopy using deep learning instance segmentation. In: 2019 International Conference on Content-Based Multimedia Indexing (CBMI), pp.\u00a01\u20136. IEEE (2019)","DOI":"10.1109\/CBMI.2019.8877379"},{"key":"38_CR20","unstructured":"Li, J., et al.: LLaVA-surg: towards multimodal surgical assistant via structured surgical video learning. arXiv preprint arXiv:2408.07981 (2024)"},{"key":"38_CR21","doi-asserted-by":"crossref","unstructured":"Li, Y., Xia, T., Luo, H., He, B., Jia, F.: MT-FIST: a multi-task fine-grained spatial-temporal framework for surgical action triplet recognition. IEEE J. Biomed. Health Inform. (2023)","DOI":"10.1109\/JBHI.2023.3299321"},{"key":"38_CR22","unstructured":"Liu, D., Hu, A., Shah, M., Xu, C.: Surgical triplet recognition via diffusion model. arXiv preprint arXiv:2406.13210 (2024)"},{"key":"38_CR23","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations, pp. 1\u201318 (2019)"},{"key":"38_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"438","DOI":"10.1007\/978-3-319-10470-6_55","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2014","author":"L Maier-Hein","year":"2014","unstructured":"Maier-Hein, L., et al.: Can masses of non-experts train highly accurate image classifiers? In: Golland, P., Hata, N., Barillot, C., Hornegger, J., Howe, R. (eds.) MICCAI 2014. LNCS, vol. 8674, pp. 438\u2013445. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10470-6_55"},{"issue":"9","key":"38_CR25","doi-asserted-by":"publisher","first-page":"691","DOI":"10.1038\/s41551-017-0132-7","volume":"1","author":"L Maier-Hein","year":"2017","unstructured":"Maier-Hein, L., et al.: Surgical data science for next-generation interventions. Nat. Biomed. Eng. 1(9), 691\u2013696 (2017)","journal-title":"Nat. Biomed. Eng."},{"key":"38_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"364","DOI":"10.1007\/978-3-030-59716-0_35","volume-title":"Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2020","author":"CI Nwoye","year":"2020","unstructured":"Nwoye, C.I., et al.: Recognition of instrument-tissue interactions in endoscopic videos via action triplets. In: Martel, A.L., et al. (eds.) MICCAI 2020. LNCS, vol. 12263, pp. 364\u2013374. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-59716-0_35"},{"key":"38_CR27","unstructured":"Nwoye, C.I., Padoy, N.: Data splits and metrics for benchmarking methods on surgical action triplet datasets. arXiv preprint arXiv:2204.05235 (2022)"},{"key":"38_CR28","doi-asserted-by":"crossref","unstructured":"Nwoye, C.I., et al.: Rendezvous: attention mechanisms for the recognition of surgical action triplets in endoscopic videos. Med. Image Anal. 78, 102433 (2022)","DOI":"10.1016\/j.media.2022.102433"},{"issue":"3","key":"38_CR29","doi-asserted-by":"publisher","first-page":"994","DOI":"10.1109\/TMI.2023.3326188","volume":"43","author":"Y Pang","year":"2023","unstructured":"Pang, Y., et al.: Slim UNETR: scale hybrid transformers to efficient 3D medical image segmentation under limited computational resources. IEEE Trans. Med. Imaging 43(3), 994\u20131005 (2023)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"38_CR30","doi-asserted-by":"crossref","unstructured":"Primus, M.J., Schoeffmann, K., B\u00f6sz\u00f6rmenyi, L.: Instrument classification in laparoscopic videos. In: 2015 13th International Workshop on Content-Based Multimedia Indexing (CBMI), pp.\u00a01\u20136. IEEE (2015)","DOI":"10.1109\/CBMI.2015.7153616"},{"key":"38_CR31","unstructured":"Radford, A.: Improving language understanding by generative pre-training (2018)"},{"key":"38_CR32","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"8","key":"38_CR33","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"38_CR34","doi-asserted-by":"publisher","first-page":"1053","DOI":"10.1007\/s11548-023-02914-1","volume":"18","author":"S Saurav","year":"2023","unstructured":"Saurav, S., Nwoye, C.I., Didier, M., Nicolas, P.: Rendezvous in time: an attention-based temporal fusion approach for surgical triplet recognition. Int. J. Comput. Assist. Radiol. Surg. 18, 1053\u20131059 (2023)","journal-title":"Int. J. Comput. Assist. Radiol. Surg."},{"issue":"1","key":"38_CR35","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1109\/TMI.2016.2593957","volume":"36","author":"AP Twinanda","year":"2016","unstructured":"Twinanda, A.P., Shehata, S., Mutter, D., Marescaux, J., De Mathelin, M., Padoy, N.: EndoNet: a deep architecture for recognition tasks on laparoscopic videos. IEEE Trans. Med. Imaging 36(1), 86\u201397 (2016)","journal-title":"IEEE Trans. Med. Imaging"},{"key":"38_CR36","doi-asserted-by":"publisher","first-page":"102770","DOI":"10.1016\/j.media.2023.102770","volume":"86","author":"M Wagner","year":"2023","unstructured":"Wagner, M., et al.: Comparative validation of machine learning algorithms for surgical workflow and skill analysis with the heichole benchmark. Med. Image Anal. 86, 102770 (2023)","journal-title":"Med. Image Anal."},{"key":"38_CR37","doi-asserted-by":"crossref","unstructured":"Xi, N., Meng, J., Yuan, J.: Chain-of-look prompting for verb-centric surgical triplet recognition in endoscopic videos. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 5007\u20135016 (2023)","DOI":"10.1145\/3581783.3611898"},{"key":"38_CR38","doi-asserted-by":"publisher","unstructured":"Yamlahi, A., et\u00a0al.: Self-distillation for surgical action recognition. In: Greenspan, H., et al. (eds.) MICCAI 2023. LNCS, vol. 14228. pp. 637\u2013646. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-43996-4_61","DOI":"10.1007\/978-3-031-43996-4_61"},{"key":"38_CR39","first-page":"122952","volume":"37","author":"K Yuan","year":"2024","unstructured":"Yuan, K., Navab, N., Padoy, N., et al.: Procedure-aware surgical video-language pretraining with hierarchical knowledge augmentation. Adv. Neural. Inf. Process. Syst. 37, 122952\u2013122983 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"38_CR40","unstructured":"Yuan, K., et al.: Learning multi-modal representations by watching hundreds of surgical video lectures. arXiv preprint arXiv:2307.15220 (2023)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5764-6_38","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T06:08:02Z","timestamp":1769148482000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5764-6_38"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557639","9789819557646"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5764-6_38","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"24 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}